package com.pdflib.cookbook.tet.tet_and_pdflib; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import com.pdflib.PDFlibException; import com.pdflib.TET; import com.pdflib.TETException; import com.pdflib.pdflib; /** * Split a document into smaller parts based on some page contents. Various * criteria for the split points could be useful. The splitting could for * example be done
*
* - after each empty page
* - when certain text appears on the page (e.g. "Address"). The text could be * visible on the page, or it could serve as a hidden marker (e.g. invisible * text or text outside the CropBox)
*

* The example below uses the latter approach. The input document "invoices.pdf" * contains a sequence of invoices. Each invoice has one or more pages. The * first page contains the recipient's address and the fixed text "INVOICE" at * known coordinates. Subsequent pages of the same invoice are blank in these * places. *

 * The goal is to split the input document into multiple output documents based
 * on the recipient's country. A real-world benefit of this could be that the
 * postage is cheaper if letters are delivered sorted by country. In the same
 * spirit, the invoices could be sorted according to ZIP code, name of the
 * addressee, etc.
 *

* Required software: TET 3 and PDFlib+PDI 8 *

* Required data: PDF document
*
*/
class burst {
    /**
     * Common search path for PDI and TET to find the input document.
     */
    private static final String DOC_SEARCH_PATH = "../input";

    /**
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST =
            "searchpath={../resource/cmap ../resource/glyphlist " + DOC_SEARCH_PATH + "}";

    /**
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=page";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * x-position of the lower left corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_LLX = 50;

    /**
     * y-position of the lower left corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_LLY = 535;

    /**
     * x-position of the upper right corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_URX = 105;

    /**
     * y-position of the upper right corner of the rectangle that contains the text
     * for detecting the first page of a sequence.
     */
    private static final double START_SEQ_TXT_URY = 550;

    /**
     * Text that must be found in the rectangle defined by START_SEQ_TXT_LLX,
     * START_SEQ_TXT_LLY, START_SEQ_TXT_URX, START_SEQ_TXT_URY in order to identify
     * a page as the start of a sequence.
     */
    private static final String START_SEQ_TXT = "INVOICE";

    /**
     * x-position of the lower left corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_LLX = 50;

    /**
     * y-position of the lower left corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_LLY = 612;

    /**
     * x-position of the upper right corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_URX = 175;

    /**
     * y-position of the upper right corner of the rectangle that contains the text
     * for the routing criterion.
     */
    private static final double CRITERION_TXT_URY = 624;

    /**
     * File name suffix that is stripped from the output name and appended to every
     * generated output file name.
     */
    private static final String PDF_SUFFIX = ".pdf";

    /**
     * Criterion used when no text could be extracted from the criterion
     * rectangle, so that such pages still end up in a well-defined output file.
     */
    private static final String FALLBACK_CRITERION = "unknown";

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * The name of the input file
     */
    private final String infilename;

    /**
     * The name of the output file without a trailing ".pdf"; the routing
     * criterion and the suffix are appended per output document.
     */
    private final String outfileBasename;

    /**
     * For mapping country names to output files. The key is the trimmed country
     * name in lowercase, the value is an object that describes the output
     * document.
     */
    private final Map<String, output_document> outputDocuments =
            new HashMap<String, output_document>();

    /**
     * The output document that received the most recent page. Pages that do not
     * start a new sequence are routed to this document.
     */
    private output_document currentOutputDocument = null;

    /**
     * Description of an output document.
     */
    private static class output_document {
        /** PDFlib object that writes this output document. */
        pdflib p;

        /** PDI handle of the input document as opened by {@link #p}. */
        int pdiHandle;

        /** Name of the generated output file. */
        String filename;
    }

    /**
     * Import the current page from the PDI import document and place it in the
     * output document.
     *
     * @param doc    The output document
     * @param pageno The current page number in the input document
     *
     * @return always true
     *
     * @throws PDFlibException an error occurred in the PDFlib API
     */
    private boolean importPdiPage(output_document doc, int pageno) throws PDFlibException {
        /*
         * The page size will be adjusted later to match the size of the input pages
         */
        doc.p.begin_page_ext(10, 10, "");

        int pdiPage = doc.p.open_pdi_page(doc.pdiHandle, pageno, "");
        if (pdiPage == -1) {
            throw new PDFlibException("Error: " + doc.p.get_errmsg());
        }

        /* Place the input page and adjust the page size */
        doc.p.fit_pdi_page(pdiPage, 0, 0, "adjustpage");
        doc.p.close_pdi_page(pdiPage);
        doc.p.end_page_ext("");

        return true;
    }

    /**
     * Extract the text found inside a rectangle on a page. The page is opened with
     * PAGE_OPTLIST plus an "includebox" option for the rectangle, a single
     * get_text() call is made, and the page is closed again.
     *
     * @param tet        The TET object for the input document
     * @param doc        The TET handle for the input document
     * @param pageNumber The number of the page to read
     * @param llx        x-position of the lower left corner of the rectangle
     * @param lly        y-position of the lower left corner of the rectangle
     * @param urx        x-position of the upper right corner of the rectangle
     * @param ury        y-position of the upper right corner of the rectangle
     *
     * @return the value returned by get_text(), which callers must expect to be
     *         null
     *
     * @throws TETException An error occurred in the TET API
     */
    private String readBoxText(TET tet, int doc, int pageNumber, double llx, double lly,
            double urx, double ury) throws TETException {
        String includeBox = "includebox={{ " + llx + " " + lly + " " + urx + " " + ury + " }}";

        int page = tet.open_page(doc, pageNumber, PAGE_OPTLIST + " " + includeBox);
        String text = tet.get_text(page);
        /*
         * NOTE(review): close_page() is deliberately not moved into a finally block;
         * confirm in the TET documentation which calls are allowed after an exception.
         */
        tet.close_page(page);

        return text;
    }

    /**
     * This routine implements the detection of the first page of a sequence.
     *
     * @param tet        The TET object for the input document
     * @param doc        The TET handle for the input document
     * @param pageNumber The number of the current page
     *
     * @return true if this is the first page of a sequence, false otherwise
     *
     * @throws TETException An error occurred in the TET API
     */
    private boolean isFirstOfSequence(TET tet, int doc, int pageNumber) throws TETException {
        String text = readBoxText(tet, doc, pageNumber, START_SEQ_TXT_LLX, START_SEQ_TXT_LLY,
                START_SEQ_TXT_URX, START_SEQ_TXT_URY);

        /* Constant-first comparison also covers a null text */
        return START_SEQ_TXT.equals(text);
    }

    /**
     * Fetch the routing criterion from the area of interest.
     *
     * @param tet        The TET object for the input document
     * @param doc        The TET handle for the input document
     * @param pageNumber The number of the current page
     *
     * @return The String for looking up the output document; may be null if no
     *         text was returned for the criterion rectangle
     *
     * @throws TETException An error occurred in the TET API
     */
    private String getRoutingCriterion(TET tet, int doc, int pageNumber) throws TETException {
        return readBoxText(tet, doc, pageNumber, CRITERION_TXT_LLX, CRITERION_TXT_LLY,
                CRITERION_TXT_URX, CRITERION_TXT_URY);
    }

    /**
     * Fetch the output document based on the criterion. Create a new output
     * document if none exists yet for the criterion.
     *
     * The criterion is trimmed and lowercased before the lookup, so that spellings
     * that differ only in case or surrounding whitespace share one output
     * document (they map to the same file name anyway). A null or blank criterion
     * is replaced by FALLBACK_CRITERION.
     *
     * @param criterion Criterion for identifying the output document, may be null
     *
     * @return The output document for the criterion
     *
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private output_document fetchOutputDocument(String criterion) throws PDFlibException {
        String label = criterion == null ? "" : criterion.trim();
        if (label.length() == 0) {
            label = FALLBACK_CRITERION;
        }

        /* Locale-independent lowercasing keeps keys and file names stable */
        String key = label.toLowerCase(java.util.Locale.ROOT);

        output_document existing = outputDocuments.get(key);
        if (existing != null) {
            return existing;
        }

        String outputFilename = outfileBasename + "_" + key.replace(' ', '_') + PDF_SUFFIX;

        pdflib p = new pdflib();
        boolean registered = false;
        try {
            p.set_option("searchpath={" + DOC_SEARCH_PATH + "}");

            if (p.begin_document(outputFilename, "") == -1) {
                throw new PDFlibException("Error: " + p.get_errmsg());
            }

            /* add document info entries */
            p.set_info("Creator", "Burst TET Cookbook Example");
            p.set_info("Author", "PDFlib GmbH");
            p.set_info("Title", infilename);
            p.set_info("Subject", "Invoices for recipient country " + label);

            int pdiHandle = p.open_pdi_document(infilename, "");
            if (pdiHandle == -1) {
                throw new PDFlibException("Error: " + p.get_errmsg());
            }

            output_document created = new output_document();
            created.p = p;
            created.pdiHandle = pdiHandle;
            created.filename = outputFilename;

            outputDocuments.put(key, created);
            registered = true;

            return created;
        }
        finally {
            /*
             * Only registered documents are deleted by execute(); release the PDFlib
             * object here if setting up the document failed.
             */
            if (!registered) {
                p.delete();
            }
        }
    }

    /**
     * Based on some criteria decide to which output document the current page
     * should go. First the function identifies whether the page is the start of a
     * new sequence or the continuation of a sequence. In the first case the output
     * document is looked up in the map of output documents, and created if
     * necessary. In the second case the page is simply routed to the current
     * document. If there is no current document yet, the page is treated like the
     * start of a sequence.
     *
     * @param tet        The TET object for the input document
     * @param doc        The TET handle for the input document
     * @param pageNumber The number of the current page
     *
     * @return The document to which the current page of the input document shall be
     *         routed to
     *
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private output_document routePage(TET tet, int doc, int pageNumber)
            throws TETException, PDFlibException {
        if (currentOutputDocument == null || isFirstOfSequence(tet, doc, pageNumber)) {
            String criterion = getRoutingCriterion(tet, doc, pageNumber);
            currentOutputDocument = fetchOutputDocument(criterion);
        }

        return currentOutputDocument;
    }

    /**
     * Process a page: Determine into which output document the current page should
     * be placed, and put it into the output document. If the page cannot be opened
     * an error is printed to System.err and the page is skipped.
     *
     * @param tet    TET object
     * @param doc    TET document handle
     * @param pageno The current page number
     *
     * @throws TETException    An error occurred in the TET API
     * @throws PDFlibException An error occurred in the PDFlib API
     */
    private void process_page(TET tet, final int doc, int pageno)
            throws TETException, PDFlibException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname()
                    + "(): " + tet.get_errmsg());
            return;
        }

        /*
         * Decide about routing the input pages
         */
        output_document target = routePage(tet, doc, pageno);

        /*
         * Copy page from input document to output document.
         */
        importPdiPage(target, pageno);

        /*
         * Close page in the input document.
         */
        tet.close_page(page);
    }

    /**
     * Print an API error to System.err. The page number is included once page
     * processing has started (pageno != 0).
     *
     * @param errnum  Error number reported by the exception
     * @param apiname Name of the API function reported by the exception
     * @param errmsg  Error message reported by the exception
     * @param pageno  Page being processed, or 0 if no page was processed yet
     */
    private static void reportError(String errnum, String apiname, String errmsg, int pageno) {
        if (pageno == 0) {
            System.err.println("Error " + errnum + " in " + apiname + "(): " + errmsg + "\n");
        }
        else {
            System.err.println("Error " + errnum + " in " + apiname + "() on page " + pageno
                    + ": " + errmsg + "\n");
        }
    }

    /**
     * Run the split: open the input document with TET, route every page to its
     * output document, and finish all output documents. TET and PDFlib errors are
     * reported on System.err; the TET object and all PDFlib objects are deleted in
     * any case.
     */
    private void execute() {
        TET tet = null;
        int pageno = 0;

        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final int doc = tet.open_document(infilename, DOC_OPTLIST);
            if (doc == -1) {
                System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname()
                        + "(): " + tet.get_errmsg());
                return;
            }

            /*
             * Loop over pages in the document
             */
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, pageno);
            }

            /*
             * Close all output documents
             */
            for (output_document o : outputDocuments.values()) {
                o.p.end_document("");
                o.p.close_pdi_document(o.pdiHandle);
                out.println("Closed output document \"" + o.filename + "\"");
            }

            tet.close_document(doc);
        }
        catch (TETException e) {
            reportError(String.valueOf(e.get_errnum()), e.get_apiname(), e.get_errmsg(), pageno);
        }
        catch (PDFlibException e) {
            reportError(String.valueOf(e.get_errnum()), e.get_apiname(), e.get_errmsg(), pageno);
        }
        finally {
            /* tet is still null if the TET constructor itself failed */
            if (tet != null) {
                tet.delete();
            }
            for (output_document o : outputDocuments.values()) {
                o.p.delete();
            }
        }
    }

    /**
     * @param infilename  the name of the input file that is split into multiple
     *                    output documents
     * @param outfilename the name of the output file; a trailing ".pdf" (in any
     *                    letter case) is removed to obtain the base name for the
     *                    generated files
     */
    private burst(String infilename, String outfilename) {
        this.infilename = infilename;

        /*
         * As the input document will be split into multiple output documents, strip a
         * potential ".pdf" suffix from the name. Only a real suffix is removed; a
         * ".pdf" in the middle of the name is left alone. regionMatches() returns
         * false for a negative offset, so short names are handled as well.
         */
        int basenameEnd = outfilename.length() - PDF_SUFFIX.length();
        boolean hasPdfSuffix =
                outfilename.regionMatches(true, basenameEnd, PDF_SUFFIX, 0, PDF_SUFFIX.length());

        this.outfileBasename = hasPdfSuffix ? outfilename.substring(0, basenameEnd) : outfilename;
    }

    /**
     * Command line entry point.
     *
     * @param args the input file name followed by the output file name
     *
     * @throws UnsupportedEncodingException OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 2) {
            out.println("usage: burst <infilename> <outfilename>");
            return;
        }

        burst t = new burst(args[0], args[1]);
        t.execute();
    }
}