package com.pdflib.cookbook.tet.tet_and_pdflib; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import com.pdflib.PDFlibException; import com.pdflib.TET; import com.pdflib.TETException; import com.pdflib.pdflib; /** * Extract some text from a PDF based on certain typographic criteria (font, * fontsize) along with the corresponding page numbers, and use PDFlib to create * a table of contents (TOC) for the original document, possibly enriched with * active links to the respective pages. With PDFlib+PDI the TOC could be * prepended to the original pages. With plain PDFlib a stand-alone TOC can be * created. *

* Required software: TET 3 and PDFlib+PDI 8 or PDFlib 8 *

* Required data: PDF document * */ class create_table_of_contents { /** * Common search path for PDI and TET to find the input document. */ private static final String DOC_SEARCH_PATH = "../input"; /** * Global option list. The program expects the "resource" directory parallel to * the "java" directory. */ private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap ../resource/glyphlist " + DOC_SEARCH_PATH + "}"; /** * Document specific option list. */ private static final String DOC_OPTLIST = ""; /** * Page-specific option list. */ private static final String PAGE_OPTLIST = "granularity=page"; /** * The encoding in which the output is sent to System.out. For running the * example in a Windows command window, you can set this for example to * "windows-1252" for getting Latin-1 output. */ private static final String OUTPUT_ENCODING = System.getProperty("file.encoding"); /** * For printing to System.out in the encoding specified via OUTPUT_ENCODING. */ private static PrintStream out; /** * The name of the input file */ private String infilename; /** * The name of the output file */ private String outfilename; /** * The name of the font to search for. */ private static final String FONT_NAME = "TheSansBold-Plain"; /** * The font size to search for in points. */ private static final double FONT_SIZE = 9; /** * The tolerance for the font size in points. */ private static final double FONT_SIZE_TOLERANCE = 0.01; /** * Nudge factor for ascender height of the Web links (relative to the font size) */ private static final double ASCENDER = 0.85; /** * Whether to use PDI to create a new document that consists of the original * document and the TOC prepended to it. Set this to false in order not to use * PDI, and in order to produce a document that only contains the TOC. */ private static final boolean USE_PDI = true; /** * The page width for the TOC pages (see "width" option for begin_page_ext() in * the PDFlib Reference Manual) */ private static final String TOC_WIDTH = "a4.width"; /** * The page height for the TOC pages (see "height" option for begin_page_ext() * PDFlib Reference Manual) */ private static final String TOC_HEIGHT = "a4.height"; /** * The title for the TOC. */ private static final String TOC_TITLE = "Table of Contents"; /** * The font to use for the headline that is placed on each page of the TOC. */ private static final String TOC_TITLE_FONT = "Helvetica-Bold"; /** * The fontsize to use for the headline that is placed on each page of the TOC. */ private static final int TOC_TITLE_FONTSIZE = 18; /** * The font to use for the TOC entries. */ private static final String TOC_FONT = "Helvetica"; /** * The fontsize for the TOC entries. */ private static final int TOC_FONTSIZE = 12; /** * x-position of the lower-left corner of the TOC fitbox. */ private static final int TOC_LLX = 110; /** * y-position of the lower-left corner of the TOC fitbox. */ private static final int TOC_LLY = 100; /** * x-position of the upper-right corner of the TOC fitbox. */ private static final int TOC_URX = 450; /** * y-position of the upper-right corner of the TOC fitbox. */ private static final int TOC_URY = 700; /** * Lower-left y-position for the TOC headline. */ private static final int TOC_TITLE_LLY = 740; /** * The prefix for a destination name. */ private static final String TOC_DESTINATION_PREFIX = "tmx"; /** * The text flow (including options) for the TOC contents. */ private StringBuffer tocTextflow = new StringBuffer(); /** * The current destination number. Used to generate unique destination names. */ private int destNumber = 0; /** * Import the current page from the PDI import document and place it in the * ouput document. * * @param p the pdflib object * @param pdiHandle the PDI handle for the input document * @param pageno the current page number * * @throws PDFlibException an error occurred in the PDFlib API */ private int put_pdi_page(pdflib p, int pdiHandle, int pageno) throws PDFlibException { /* * The page size will be adjusted later to match the size of the input pages */ p.begin_page_ext(10, 10, "group content"); int pageHandle = p.open_pdi_page(pdiHandle, pageno, ""); if (pageHandle != -1) { /* Place the input page and adjust the page size */ p.fit_pdi_page(pageHandle, 0, 0, "adjustpage"); } return pageHandle; } /** * Tests whether the current character matches the criteria for text that shall * get a an entry in the TOC. get_char_info must have been called before in * order to ensure that the TET object contains the information for the current * character. * * @param tet The TET object * @param doc The TET document handle * @throws TETException */ private boolean font_matches(TET tet, final int doc) throws TETException { String name = tet.pcos_get_string(doc, "fonts[" + tet.fontid + "]/name"); return name.equals(FONT_NAME) && (Math.abs(tet.fontsize - FONT_SIZE) <= FONT_SIZE_TOLERANCE); } /** * Add text and options to the textflow for the current TOC entry. The options * take care that the TOC will be properly formattted (do not split TOC entries * over page boundaries). * * @param p The pdflib object * @param tocText The text of the TOC entry to add * @param pageno The page number for the TOC entry * @param ulx x-position of the identified text on the page * @param uly y-position of the identified text on the page * * @throws PDFlibException An error occurred in the PDFlib API */ private void add_toc_entry(pdflib p, String tocText, int pageno, double ulx, double uly) throws PDFlibException { /* * The same name is used for the matchbox name in the TOC and for the named * destination that is the target of the "GoTo" action in the TOC. */ String destName = get_destination_name(destNumber); /* * We need pairwise marks that enclose the text that shall be kept together. */ tocTextflow.append("").append(tocText).append("\t") .append(pageno).append("\n"); if (USE_PDI) { p.add_nameddest(destName, "type=fixed left=" + ulx + " top=" + uly); } destNumber += 1; } /** * Create a unique name for each destination. * * @param number The number of the destination. * * @return A string that can be used as a destination name and as the * corresponding matchbox name. */ private String get_destination_name(int number) { return TOC_DESTINATION_PREFIX + number; } /** * Process a page: Create a new page in the output document, place the page from * the input document in the output document, and create TOC entries for all * occurrences of text with the desired properties. * * @param tet TET object * @param doc TET document handle * @param p pdflib object * @param pdiHandle PDI document handle * @param pageno The current page number * @throws TETException An error occurred in the TET API * @throws PDFlibException An error occurred in the PDFlib API */ private void process_page(TET tet, final int doc, pdflib p, int pdiHandle, int pageno) throws TETException, PDFlibException { if (USE_PDI) { put_pdi_page(p, pdiHandle, pageno); } final int page = tet.open_page(doc, pageno, PAGE_OPTLIST); if (page == -1) { System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg()); } else { /* Retrieve all text fragments for the page */ for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) { int nextCharPos = 0; int matchStart = -1; double uly = 0; double ulx = 0; while (tet.get_char_info(page) != -1) { nextCharPos += 1; if (font_matches(tet, doc)) { if (matchStart == -1) { // start of new matching chunk matchStart = nextCharPos - 1; uly = tet.y + ASCENDER * tet.fontsize; ulx = tet.x; } } else { if (matchStart != -1) { /* * End of matching chunk. The last character that is belonging to the chunk is * the one preceding the next character. matchEnd is the index of the first * character after the bookmark characters. */ int matchEnd = nextCharPos - 1; /* * remove trailing whitespace */ while (matchEnd > matchStart && Character.isWhitespace(text.charAt(matchEnd - 1))) { matchEnd -= 1; } String tocText = text.substring(matchStart, matchEnd); out.println("Creating TOC entry \"" + tocText + "\""); add_toc_entry(p, tocText, pageno, ulx, uly); matchStart = -1; } } } } if (tet.get_errnum() != 0) { System.err .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg()); } /* * Close page in the input and output documents. */ if (USE_PDI) { p.end_page_ext(""); } tet.close_page(page); } } private void create_toc(pdflib p) throws Exception { final String optlist = "fontname=" + TOC_FONT + " fontsize=" + TOC_FONTSIZE + " encoding=unicode ruler=100%" + " hortabmethod=ruler tabalignment=right"; /* * Load the font for the title of the TOC */ int font = p.load_font(TOC_TITLE_FONT, "unicode", ""); if (font == -1) throw new Exception("Error: " + p.get_errmsg()); /* * Create a textflow for the collected TOC entries. */ int tf = p.create_textflow(tocTextflow.toString(), optlist); if (tf == -1) throw new Exception("Error: " + p.get_errmsg()); /* * Maximum number of the mark defined in the Textflow */ int maxMark = (destNumber * 2) - 1; /* * Keep track of the first mark for each page for creating the "GoTo" actions on * the page. */ int startMark = 0; /* * Loop until all of the text is placed; create new pages as long as more text * needs to be placed. */ String result; do { p.begin_page_ext(0, 0, "group=toc width=" + TOC_WIDTH + " height=" + TOC_HEIGHT); /* Place a text line with a title */ p.setfont(font, TOC_TITLE_FONTSIZE); p.fit_textline(TOC_TITLE, TOC_LLX, TOC_TITLE_LLY, ""); /* * Place the Textflow with the table of contents in blind mode, to find out how * many marks did fit into the box. With this information we also prevent a TOC * entry from being split over two pages by using the "returnatmark" option. */ result = p.fit_textflow(tf, TOC_LLX, TOC_LLY, TOC_URX, TOC_URY, "blind"); int lastMark = (int) p.info_textflow(tf, "lastmark"); /* * An even mark number indicates the start of a text section to be kept * together. Reset it to the last odd mark number which indicates the end of a * text section. */ if (lastMark % 2 == 0) { lastMark -= 1; } /* * Now actually fit the textflow. To rewind the textflow status to before the * last call to fit_textflow() use "rewind=-1". */ result = p.fit_textflow(tf, TOC_LLX, TOC_LLY, TOC_URX, TOC_URY, "returnatmark=" + lastMark + " rewind=-1"); /* * When PDI is in use, create the "GoTo" actions that allows to click on a TOC * entry for jumping to the corresponding section of the document. */ for (int i = startMark; USE_PDI && i <= lastMark / 2; i += 1) { String destinationName = get_destination_name(i); int action = p.create_action("GoTo", "destname=" + destinationName); p.create_annotation(0, 0, 0, 0, "Link", "action={activate " + action + "} linewidth=0 usematchbox={" + destinationName + "}"); } startMark = (lastMark / 2) + 1; p.end_page_ext(""); /* * "_boxfull" means we must continue because there is more text; "_nextpage" is * interpreted as "start new column" */ } while (!result.equals("_stop") && !result.equals("_boxempty") && !result.equals("_mark" + maxMark)); /* Check for errors */ if (result.equals("_boxempty")) throw new Exception("Error: Textflow box for TOC is too small"); p.delete_textflow(tf); } private void execute() { TET tet = null; pdflib p = null; int pageno = 0; try { tet = new TET(); tet.set_option(GLOBAL_OPTLIST); p = new pdflib(); p.set_option("searchpath={" + DOC_SEARCH_PATH + "}"); if (p.begin_document(outfilename, "groups={toc content}") == -1) { System.err.println("Error: " + p.get_errmsg()); return; } /* add document info entries */ p.set_info("Creator", "Create Table Of Contents TET Cookbook Example"); p.set_info("Author", "PDFlib GmbH"); p.set_info("Title", infilename); final int doc = tet.open_document(infilename, DOC_OPTLIST); if (doc == -1) { System.err .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg()); return; } final int n_pages = (int) tet.pcos_get_number(doc, "length:pages"); int pdiHandle = -1; if (USE_PDI) { pdiHandle = p.open_pdi_document(infilename, ""); if (pdiHandle == -1) { System.err.println("Error: " + p.get_errmsg()); return; } } /* * Loop over pages in the document */ for (pageno = 1; pageno <= n_pages; ++pageno) { process_page(tet, doc, p, pdiHandle, pageno); } /* * Use the information collected while processing the input document to create * the table of contents. */ create_toc(p); p.end_document(""); if (USE_PDI) { p.close_pdi_document(pdiHandle); } tet.close_document(doc); out.println("Created PDF output document \"" + outfilename + "\" with " + destNumber + " TOC entries."); } catch (TETException e) { if (pageno == 0) { System.err .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n"); } else { System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": " + e.get_errmsg() + "\n"); } } catch (PDFlibException e) { if (pageno == 0) { System.err .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n"); } else { System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": " + e.get_errmsg() + "\n"); } } catch (Exception e) { System.err.println(e); } finally { tet.delete(); p.delete(); } } /** * @param infilename the name of the file for which the bookmarked file will be * generated * @param outfilename the name of the output file */ private create_table_of_contents(String infilename, String outfilename) { this.infilename = infilename; this.outfilename = outfilename; } public static void main(String[] args) throws UnsupportedEncodingException { System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\""); out = new PrintStream(System.out, true, OUTPUT_ENCODING); if (args.length != 2) { out.println("usage: create_table_of_contents "); return; } create_table_of_contents t = new create_table_of_contents(args[0], args[1]); t.execute(); } }