タイポグラフィ上の条件(フォントやフォントサイズ)に基づいて、PDF からテキストを該当ページ番号とともに抽出し、PDFlib を利用して、既存の文書のそれぞれのページへリンクする目次を作成します。
package com.pdflib.cookbook.tet.tet_and_pdflib;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import com.pdflib.PDFlibException;
import com.pdflib.TET;
import com.pdflib.TETException;
import com.pdflib.pdflib;
/**
 * Extract some text from a PDF based on certain typographic criteria (font,
 * font size) along with the corresponding page numbers, and use PDFlib to create
 * a table of contents (TOC) for the original document, possibly enriched with
 * active links to the respective pages. With PDFlib+PDI the TOC could be
 * prepended to the original pages. With plain PDFlib a stand-alone TOC can be
 * created.
 *
 * Required software: TET 3 and PDFlib+PDI 8 or PDFlib 8
 *
 * Required data: PDF document
 */
class create_table_of_contents {
    /**
     * Common search path for PDI and TET to find the input document.
     */
    private static final String DOC_SEARCH_PATH = "../input";

    /**
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap ../resource/glyphlist "
            + DOC_SEARCH_PATH + "}";

    /**
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=page";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * Assigned in main() before any instance method runs.
     */
    private static PrintStream out;

    /**
     * The name of the input file
     */
    private final String infilename;

    /**
     * The name of the output file
     */
    private final String outfilename;

    /**
     * The name of the font to search for.
     */
    private static final String FONT_NAME = "TheSansBold-Plain";

    /**
     * The font size to search for in points.
     */
    private static final double FONT_SIZE = 9;

    /**
     * The tolerance for the font size in points.
     */
    private static final double FONT_SIZE_TOLERANCE = 0.01;

    /**
     * Nudge factor for the ascender height (relative to the font size). It is
     * added to the y position of the first matching character to obtain the top
     * coordinate of the named destination for a TOC entry.
     */
    private static final double ASCENDER = 0.85;

    /**
     * Whether to use PDI to create a new document that consists of the original
     * document and the TOC prepended to it. Set this to false in order not to use
     * PDI, and in order to produce a document that only contains the TOC.
     */
    private static final boolean USE_PDI = true;

    /**
     * The page width for the TOC pages (see "width" option for begin_page_ext() in
     * the PDFlib Reference Manual)
     */
    private static final String TOC_WIDTH = "a4.width";

    /**
     * The page height for the TOC pages (see "height" option for begin_page_ext()
     * in the PDFlib Reference Manual)
     */
    private static final String TOC_HEIGHT = "a4.height";

    /**
     * The title for the TOC.
     */
    private static final String TOC_TITLE = "Table of Contents";

    /**
     * The font to use for the headline that is placed on each page of the TOC.
     */
    private static final String TOC_TITLE_FONT = "Helvetica-Bold";

    /**
     * The fontsize to use for the headline that is placed on each page of the TOC.
     */
    private static final int TOC_TITLE_FONTSIZE = 18;

    /**
     * The font to use for the TOC entries.
     */
    private static final String TOC_FONT = "Helvetica";

    /**
     * The fontsize for the TOC entries.
     */
    private static final int TOC_FONTSIZE = 12;

    /**
     * x-position of the lower-left corner of the TOC fitbox.
     */
    private static final int TOC_LLX = 110;

    /**
     * y-position of the lower-left corner of the TOC fitbox.
     */
    private static final int TOC_LLY = 100;

    /**
     * x-position of the upper-right corner of the TOC fitbox.
     */
    private static final int TOC_URX = 450;

    /**
     * y-position of the upper-right corner of the TOC fitbox.
     */
    private static final int TOC_URY = 700;

    /**
     * Lower-left y-position for the TOC headline.
     */
    private static final int TOC_TITLE_LLY = 740;

    /**
     * The prefix for a destination name.
     */
    private static final String TOC_DESTINATION_PREFIX = "tmx";

    /**
     * The text flow (including inline options) for the TOC contents. It is filled
     * entry by entry while the input pages are processed.
     */
    private final StringBuilder tocTextflow = new StringBuilder();

    /**
     * The current destination number. Used to generate unique destination names;
     * after processing it equals the number of TOC entries.
     */
    private int destNumber = 0;

    /**
     * Begin a new page in the "content" group of the output document, import the
     * given page from the PDI input document and place it on the new page. The
     * output page is left open; the caller must close it with end_page_ext().
     *
     * @param p         the pdflib object
     * @param pdiHandle the PDI handle for the input document
     * @param pageno    the current page number
     *
     * @return the PDI page handle, or -1 if the page could not be opened (in that
     *         case the output page keeps its provisional size)
     *
     * @throws PDFlibException an error occurred in the PDFlib API
     */
    private int put_pdi_page(pdflib p, int pdiHandle, int pageno) throws PDFlibException {
        /*
         * The page size will be adjusted later to match the size of the input pages
         */
        p.begin_page_ext(10, 10, "group content");
        int pageHandle = p.open_pdi_page(pdiHandle, pageno, "");
        if (pageHandle != -1) {
            /* Place the input page and adjust the page size */
            p.fit_pdi_page(pageHandle, 0, 0, "adjustpage");
        } else {
            /* Do not fail silently: the output will contain an empty page here. */
            System.err.println("Error on page " + pageno + ": " + p.get_errmsg());
        }
        return pageHandle;
    }

    /**
     * Tests whether the current character matches the criteria for text that shall
     * get an entry in the TOC. get_char_info must have been called before in
     * order to ensure that the TET object contains the information for the current
     * character.
     *
     * @param tet The TET object
     * @param doc The TET document handle
     *
     * @return true if the font size is within FONT_SIZE_TOLERANCE of FONT_SIZE and
     *         the font name equals FONT_NAME
     *
     * @throws TETException An error occurred in the TET API
     */
    private boolean font_matches(TET tet, final int doc) throws TETException {
        /*
         * Test the font size first: it is a plain field comparison, whereas the font
         * name requires a pCOS query. Written as a negated "<=" so that a NaN size
         * is rejected.
         */
        if (!(Math.abs(tet.fontsize - FONT_SIZE) <= FONT_SIZE_TOLERANCE)) {
            return false;
        }
        String name = tet.pcos_get_string(doc, "fonts[" + tet.fontid + "]/name");
        return FONT_NAME.equals(name);
    }

    /**
     * Add text and options to the textflow for the current TOC entry. The options
     * take care that the TOC will be properly formatted (do not split TOC entries
     * over page boundaries). When PDI is in use, a named destination for the entry
     * is added to the currently open output page.
     *
     * NOTE(review): tocText is inserted verbatim into a textflow that contains
     * inline option lists; text containing '<' would presumably be interpreted as
     * the start of an option list - confirm against the PDFlib documentation.
     *
     * @param p       The pdflib object
     * @param tocText The text of the TOC entry to add
     * @param pageno  The page number for the TOC entry
     * @param ulx     x-position of the identified text on the page
     * @param uly     y-position of the identified text on the page
     *
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void add_toc_entry(pdflib p, String tocText, int pageno, double ulx, double uly)
            throws PDFlibException {
        /*
         * The same name is used for the matchbox name in the TOC and for the named
         * destination that is the target of the "GoTo" action in the TOC.
         */
        String destName = get_destination_name(destNumber);
        /*
         * We need pairwise marks that enclose the text that shall be kept together:
         * an even mark at the start of the entry and the following odd mark at its end.
         */
        int startMark = destNumber * 2;
        int endMark = startMark + 1;
        tocTextflow.append("<mark=").append(startMark)
                .append(" alignment=left matchbox={name=").append(destName)
                .append(" boxheight={fontsize descender}}>").append(tocText)
                .append("<leader={alignment={grid}}>\t").append(pageno)
                .append("<matchbox=end nextline mark=").append(endMark)
                .append(">\n");
        if (USE_PDI) {
            p.add_nameddest(destName, "type=fixed left=" + ulx + " top=" + uly);
        }
        destNumber += 1;
    }

    /**
     * Create a unique name for each destination.
     *
     * @param number The number of the destination.
     *
     * @return A string that can be used as a destination name and as the
     *         corresponding matchbox name.
     */
    private String get_destination_name(int number) {
        return TOC_DESTINATION_PREFIX + number;
    }

    /**
     * Print the error state of the TET object to System.err.
     *
     * @param tet TET object
     */
    private static void report_tet_error(TET tet) {
        System.err.println("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "(): " + tet.get_errmsg());
    }

    /**
     * Print the details of a TET or PDFlib exception to System.err.
     *
     * @param errnum  error number reported by the exception
     * @param apiname name of the API function reported by the exception
     * @param errmsg  error message reported by the exception
     * @param pageno  page being processed, or 0 if the error is not page-related
     */
    private static void report_api_exception(int errnum, String apiname, String errmsg, int pageno) {
        String where = (pageno == 0) ? "()" : "() on page " + pageno;
        System.err.println("Error " + errnum + " in " + apiname + where + ": " + errmsg + "\n");
    }

    /**
     * Process a page: Create a new page in the output document, place the page from
     * the input document in the output document, and create TOC entries for all
     * occurrences of text with the desired properties.
     *
     * @param tet       TET object
     * @param doc       TET document handle
     * @param p         pdflib object
     * @param pdiHandle PDI document handle
     * @param pageno    The current page number
     *
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void process_page(TET tet, final int doc, pdflib p, int pdiHandle, int pageno)
            throws TETException, PDFlibException {
        int pdiPage = -1;
        if (USE_PDI) {
            pdiPage = put_pdi_page(p, pdiHandle, pageno);
        }
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
        if (page == -1) {
            report_tet_error(tet);
        } else {
            collect_toc_entries(tet, doc, p, page, pageno);
            if (tet.get_errnum() != 0) {
                report_tet_error(tet);
            }
            tet.close_page(page);
        }
        /*
         * Always finish the output page, even if TET could not open the input page;
         * otherwise the next begin_page_ext() would be issued while a page is still
         * open. The imported page is no longer needed once it has been placed.
         */
        if (USE_PDI) {
            if (pdiPage != -1) {
                p.close_pdi_page(pdiPage);
            }
            p.end_page_ext("");
        }
    }

    /**
     * Retrieve all text fragments of an opened TET page and create a TOC entry for
     * each run of consecutive characters that satisfies font_matches().
     *
     * NOTE(review): character positions are counted as one String index per
     * get_char_info() call; whether this holds for characters outside the BMP
     * should be confirmed against the TET documentation.
     *
     * @param tet    TET object
     * @param doc    TET document handle
     * @param p      pdflib object
     * @param page   TET page handle
     * @param pageno The current page number
     *
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void collect_toc_entries(TET tet, final int doc, pdflib p, final int page, int pageno)
            throws TETException, PDFlibException {
        for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
            /* Number of characters for which get_char_info() has been called */
            int charsSeen = 0;
            /* Index of the first character of the current match, -1 if none */
            int matchStart = -1;
            double ulx = 0;
            double uly = 0;
            while (tet.get_char_info(page) != -1) {
                charsSeen += 1;
                if (font_matches(tet, doc)) {
                    if (matchStart == -1) {
                        /* start of new matching chunk */
                        matchStart = charsSeen - 1;
                        ulx = tet.x;
                        uly = tet.y + ASCENDER * tet.fontsize;
                    }
                } else if (matchStart != -1) {
                    /*
                     * End of matching chunk: the current character is the first one
                     * after the chunk, so its index is the exclusive end.
                     */
                    emit_match(p, text, matchStart, charsSeen - 1, pageno, ulx, uly);
                    matchStart = -1;
                }
            }
            /*
             * A chunk that extends to the very end of the text has no following
             * non-matching character; emit it here so it is not lost.
             */
            if (matchStart != -1) {
                emit_match(p, text, matchStart, charsSeen, pageno, ulx, uly);
            }
        }
    }

    /**
     * Create a TOC entry for text.substring(matchStart, matchEnd) with trailing
     * whitespace removed. Nothing is created if no text remains.
     *
     * @param p          pdflib object
     * @param text       the text fragment containing the match
     * @param matchStart index of the first matching character
     * @param matchEnd   index of the first character after the match
     * @param pageno     The current page number
     * @param ulx        x-position of the start of the match
     * @param uly        y-position of the top of the match
     *
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void emit_match(pdflib p, String text, int matchStart, int matchEnd, int pageno,
            double ulx, double uly) throws PDFlibException {
        /* Clamp the indices so that a counting mismatch cannot break substring() */
        int end = Math.min(matchEnd, text.length());
        int start = Math.min(matchStart, end);
        /* remove trailing whitespace */
        while (end > start && Character.isWhitespace(text.charAt(end - 1))) {
            end -= 1;
        }
        if (end == start) {
            /* Whitespace-only match: an entry without text is useless */
            return;
        }
        String tocText = text.substring(start, end);
        out.println("Creating TOC entry \"" + tocText + "\"");
        add_toc_entry(p, tocText, pageno, ulx, uly);
    }

    /**
     * Create the TOC pages from the collected textflow. Pages are added to the
     * "toc" page group until the whole textflow has been placed. When PDI is in
     * use, a link annotation with a "GoTo" action is created for every entry.
     *
     * @param p pdflib object
     *
     * @throws Exception if the TOC font or the textflow cannot be created, if the
     *                   fitbox is too small, or if an error occurred in the PDFlib
     *                   API
     */
    private void create_toc(pdflib p) throws Exception {
        final String optlist = "fontname=" + TOC_FONT + " fontsize="
                + TOC_FONTSIZE + " encoding=unicode ruler=100%"
                + " hortabmethod=ruler tabalignment=right";
        /*
         * Load the font for the title of the TOC
         */
        int font = p.load_font(TOC_TITLE_FONT, "unicode", "");
        if (font == -1) {
            throw new Exception("Error: " + p.get_errmsg());
        }
        /*
         * Create a textflow for the collected TOC entries.
         */
        int tf = p.create_textflow(tocTextflow.toString(), optlist);
        if (tf == -1) {
            throw new Exception("Error: " + p.get_errmsg());
        }
        /*
         * Maximum number of the mark defined in the Textflow
         */
        int maxMark = (destNumber * 2) - 1;
        /*
         * Number of the first TOC entry on the current page; used for creating the
         * "GoTo" actions on the page.
         */
        int firstEntry = 0;
        /*
         * Loop until all of the text is placed; create new pages as long as more text
         * needs to be placed.
         */
        String result;
        do {
            p.begin_page_ext(0, 0, "group=toc width=" + TOC_WIDTH + " height="
                    + TOC_HEIGHT);
            /* Place a text line with a title */
            p.setfont(font, TOC_TITLE_FONTSIZE);
            p.fit_textline(TOC_TITLE, TOC_LLX, TOC_TITLE_LLY, "");
            /*
             * Place the Textflow with the table of contents in blind mode, to find out how
             * many marks did fit into the box. With this information we also prevent a TOC
             * entry from being split over two pages by using the "returnatmark" option.
             */
            p.fit_textflow(tf, TOC_LLX, TOC_LLY, TOC_URX, TOC_URY, "blind");
            int lastMark = (int) p.info_textflow(tf, "lastmark");
            /*
             * An even mark number indicates the start of a text section to be kept
             * together. Reset it to the last odd mark number which indicates the end of a
             * text section.
             */
            if (lastMark % 2 == 0) {
                lastMark -= 1;
            }
            /*
             * Now actually fit the textflow. To rewind the textflow status to before the
             * last call to fit_textflow() use "rewind=-1". A negative lastMark means that
             * no complete entry was found, so there is no mark to return at.
             */
            String fitOptions = "rewind=-1";
            if (lastMark >= 0) {
                fitOptions = "returnatmark=" + lastMark + " " + fitOptions;
            }
            result = p.fit_textflow(tf, TOC_LLX, TOC_LLY, TOC_URX, TOC_URY, fitOptions);
            /*
             * Number of the last entry on this page. Integer division truncates
             * towards zero (-1 / 2 == 0), so a negative mark must be handled
             * explicitly: it means that no entry was completed on this page.
             */
            int lastEntry = (lastMark >= 0) ? lastMark / 2 : firstEntry - 1;
            /*
             * When PDI is in use, create the "GoTo" actions that allows to click on a TOC
             * entry for jumping to the corresponding section of the document.
             */
            if (USE_PDI) {
                for (int i = firstEntry; i <= lastEntry; i += 1) {
                    String destinationName = get_destination_name(i);
                    int action = p.create_action("GoTo", "destname=" + destinationName);
                    p.create_annotation(0, 0, 0, 0, "Link",
                            "action={activate " + action + "} linewidth=0 usematchbox={"
                                    + destinationName + "}");
                }
            }
            firstEntry = lastEntry + 1;
            p.end_page_ext("");
            /*
             * "_boxfull" means we must continue because there is more text; "_nextpage" is
             * interpreted as "start new column"
             */
        } while (!result.equals("_stop") && !result.equals("_boxempty")
                && !result.equals("_mark" + maxMark));
        /* Check for errors */
        if (result.equals("_boxempty")) {
            throw new Exception("Error: Textflow box for TOC is too small");
        }
        p.delete_textflow(tf);
    }

    /**
     * Run the conversion: open the input document with TET (and with PDI if
     * USE_PDI is set), process all pages, create the TOC and write the output
     * document. Errors are reported on System.err; the TET and PDFlib objects are
     * always deleted at the end.
     */
    private void execute() {
        TET tet = null;
        pdflib p = null;
        /* Page being processed; 0 while no specific page is being processed */
        int pageno = 0;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);
            p = new pdflib();
            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");
            if (p.begin_document(outfilename, "groups={toc content}") == -1) {
                System.err.println("Error: " + p.get_errmsg());
                return;
            }
            /* add document info entries */
            p.set_info("Creator", "Create Table Of Contents TET Cookbook Example");
            p.set_info("Author", "PDFlib GmbH");
            p.set_info("Title", infilename);
            final int doc = tet.open_document(infilename, DOC_OPTLIST);
            if (doc == -1) {
                report_tet_error(tet);
                return;
            }
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            int pdiHandle = -1;
            if (USE_PDI) {
                pdiHandle = p.open_pdi_document(infilename, "");
                if (pdiHandle == -1) {
                    System.err.println("Error: " + p.get_errmsg());
                    return;
                }
            }
            /*
             * Loop over pages in the document
             */
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, p, pdiHandle, pageno);
            }
            /*
             * Page processing is finished; errors from here on are not related to a
             * particular input page.
             */
            pageno = 0;
            /*
             * Use the information collected while processing the input document to create
             * the table of contents.
             */
            create_toc(p);
            p.end_document("");
            if (USE_PDI) {
                p.close_pdi_document(pdiHandle);
            }
            tet.close_document(doc);
            out.println("Created PDF output document \"" + outfilename + "\" with "
                    + destNumber + " TOC entries.");
        } catch (TETException e) {
            report_api_exception(e.get_errnum(), e.get_apiname(), e.get_errmsg(), pageno);
        } catch (PDFlibException e) {
            report_api_exception(e.get_errnum(), e.get_apiname(), e.get_errmsg(), pageno);
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            /*
             * Either object may still be null if its construction (or an earlier
             * step) failed; an unguarded call would replace the real error with a
             * NullPointerException.
             */
            if (tet != null) {
                tet.delete();
            }
            if (p != null) {
                p.delete();
            }
        }
    }

    /**
     * @param infilename  the name of the file for which the file with the table of
     *                    contents will be generated
     * @param outfilename the name of the output file
     */
    private create_table_of_contents(String infilename, String outfilename) {
        this.infilename = infilename;
        this.outfilename = outfilename;
    }

    /**
     * Program entry point. Expects exactly two arguments: the input file name and
     * the output file name. Prints a usage message otherwise.
     *
     * @param args command line arguments
     *
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);
        if (args.length != 2) {
            out.println("usage: create_table_of_contents <infilename> <outfilename>");
            return;
        }
        create_table_of_contents t = new create_table_of_contents(args[0], args[1]);
        t.execute();
    }
}