/**
 * Extract text from annotations with PDFlib TET and the pCOS interface.
 *
 * The topic "formfields" in the pCOS Cookbook demonstrates how to read
 * the values of form fields.
 * <p>
 * Required software: TET 5
 * <p>
 * Required data: PDF document
 */
package com.pdflib.cookbook.tet.text;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;

import com.pdflib.TETException;
import com.pdflib.TET;

/**
 * Command-line sample that lists the non-empty /Contents text of every
 * annotation (except Widget annotations, i.e. form fields) in a PDF
 * document, together with the annotation subtype and its /Rect entry.
 */
public class text_from_annotations {
    /**
     * Global option list
     */
    static final String GLOBAL_OPTLIST = "searchpath={../input}";

    /**
     * Document-specific option list
     */
    static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list
     */
    static final String PAGE_OPTLIST = "";

    /**
     * Separator to use as paragraph break. Text in PDF annotations uses
     * U+000D as paragraph separator which is impractical in many environments,
     * therefore we replace it.
     */
    static final String SEPARATOR = "\n";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * Program entry point.
     *
     * @param argv
     *            exactly one argument: the name of the PDF document to process
     * @throws UnsupportedEncodingException
     *             if OUTPUT_ENCODING is not a supported encoding name
     */
    public static void main(String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        /*
         * The exit status is recorded here and applied only after the
         * finally block has run. Calling System.exit() inside a catch block
         * would terminate the JVM before the TET object is deleted.
         */
        int exitCode = 0;
        TET tet = null;

        try {
            if (argv.length != 1) {
                throw new Exception("usage: text_from_annotations <filename>");
            }

            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            int doc = tet.open_document(argv[0], DOC_OPTLIST);
            if (doc == -1) {
                throw new Exception(
                        "Error " + tet.get_errnum() + " in " + tet.get_apiname()
                                + "(): " + tet.get_errmsg());
            }

            out.print("File name: " + tet.pcos_get_string(doc, "filename") + "\n");

            int pagecount = (int) tet.pcos_get_number(doc, "length:pages");

            /* The number format is identical for all rectangles, build it once. */
            DecimalFormat coordinateFormat = createCoordinateFormat();

            /* Loop over all pages */
            for (int page = 0; page < pagecount; page++) {
                printPageAnnotations(tet, doc, page, coordinateFormat);
            }

            /*
             * NOTE(review): on the error path the document is not closed
             * explicitly; presumably tet.delete() releases it - confirm
             * against the TET API documentation.
             */
            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("TET exception occurred in text_from_annotations sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() + ": " + e.get_errmsg());
            exitCode = 1;
        } catch (Exception e) {
            System.err.println(e);
            exitCode = 1;
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }

        if (exitCode != 0) {
            System.exit(exitCode);
        }
    }

    /**
     * Creates the number format used for rectangle coordinates: between zero
     * and two fraction digits, with US decimal format symbols.
     */
    private static DecimalFormat createCoordinateFormat() {
        DecimalFormat format = new DecimalFormat();
        format.setMinimumFractionDigits(0);
        format.setMaximumFractionDigits(2);
        format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
        return format;
    }

    /**
     * Prints subtype, rectangle and contents of each annotation on one page
     * that has a non-empty string /Contents entry and is not a Widget.
     *
     * @param tet
     *            the TET object
     * @param doc
     *            the document handle returned by open_document()
     * @param page
     *            zero-based page index
     * @param coordinateFormat
     *            number format for the rectangle coordinates
     * @throws TETException
     *             if a TET/pCOS call fails
     */
    private static void printPageAnnotations(TET tet, int doc, int page,
            DecimalFormat coordinateFormat) throws TETException {
        String basePath = "pages[" + page + "]/annots";

        /* Get number of annotations on this page */
        int annotationCount = (int) tet.pcos_get_number(doc, "length:" + basePath);

        for (int ann = 0; ann < annotationCount; ann++) {
            /* pCOS path for the next annotation: "pages[n]/annots[m]" */
            String annotationPath = basePath + "[" + ann + "]";

            String subtype = tet.pcos_get_string(doc, annotationPath + "/Subtype");

            /* Ignore form fields (=Widgets) */
            if (subtype.equals("Widget")) {
                continue;
            }

            /* Ignore annotations without any /Contents entry or empty Contents */
            String contentsType = tet.pcos_get_string(doc, "type:" + annotationPath + "/Contents");
            if (!contentsType.equals("string")) {
                continue;
            }

            String contents = tet.pcos_get_string(doc, annotationPath + "/Contents");
            if (contents.length() == 0) {
                continue;
            }

            /*
             * Print the type of the annotation. You can use it to filter
             * out unwanted annotation types. For example, annotation
             * type "FreeText" is the only type which places text
             * directly on the page.
             *
             * All output goes through "out" so that the whole line is
             * written in OUTPUT_ENCODING.
             */
            out.print("page " + (page + 1) + ", annotation type: " + subtype + ", ");

            printRectangle(tet, doc, annotationPath, coordinateFormat);

            /*
             * Print contents of the annotation, i.e. the actual text.
             * Text in PDF annotations uses U+000D as paragraph separator
             * which is impractical in many environments, therefore we
             * replace it.
             */
            contents = contents.replace("\r", SEPARATOR);
            out.print("contents: '" + contents + "'\n");
        }
    }

    /**
     * Prints the /Rect entry of an annotation as "rectangle: [a b c d]", or
     * "rectangle: (not available)" if /Rect is not an array of length 4.
     *
     * @param tet
     *            the TET object
     * @param doc
     *            the document handle returned by open_document()
     * @param annotationPath
     *            pCOS path of the annotation
     * @param coordinateFormat
     *            number format for the rectangle coordinates
     * @throws TETException
     *             if a TET/pCOS call fails
     */
    private static void printRectangle(TET tet, int doc, String annotationPath,
            DecimalFormat coordinateFormat) throws TETException {
        out.print("rectangle: ");

        String rectPath = annotationPath + "/Rect";
        boolean isFourElementArray =
                tet.pcos_get_string(doc, "type:" + rectPath).equals("array")
                        && (int) tet.pcos_get_number(doc, "length:" + rectPath) == 4;

        if (!isFourElementArray) {
            out.print("(not available)\n");
            return;
        }

        out.print("[");
        for (int i = 0; i < 4; i += 1) {
            if (i > 0) {
                out.print(" ");
            }
            out.print(coordinateFormat.format(
                    tet.pcos_get_number(doc, rectPath + "[" + i + "]")));
        }
        out.print("]\n");
    }
}