PDFlib TET サンプル集(クックブック)
PDFlib TET と pCOS インターフェースを使用して、 PDF文書上の注釈からテキストを抽出するサンプルプログラムです。
必要な製品: PDFlib TET
/**
* Extract text from annotations with PDFlib TET and the pCOS interface
*
* The topic "formfields" in the pCOS Cookbook demonstrates how to read
* the values of form fields.
*
* Required software: TET 5
*
* Required data: PDF document
*
*/
package com.pdflib.cookbook.tet.text;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.Locale;
import com.pdflib.TETException;
import com.pdflib.TET;
public class text_from_annotations {
    /**
     * Global option list
     */
    static final String GLOBAL_OPTLIST = "searchpath={../input}";

    /**
     * Document-specific option list
     */
    static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list
     */
    static final String PAGE_OPTLIST = "";

    /**
     * Separator to use as paragraph break. Text in PDF annotations uses
     * U+000D as paragraph separator which is impractical in many environments,
     * therefore we replace it.
     */
    static final String SEPARATOR = "\n";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * All regular program output goes through this stream so that a single
     * output line is never written in two different encodings.
     */
    private static PrintStream out;

    /**
     * Opens the PDF document named on the command line and prints, for every
     * non-Widget annotation with non-empty /Contents, its page number, its
     * subtype, its rectangle and its text contents.
     *
     * On any error a message is written to System.err and the process exits
     * with status 1.
     *
     * @param argv exactly one element: the name of the PDF file to process
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not a
     *         supported encoding name
     */
    public static void main(String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        /*
         * The exit status is only recorded inside the catch blocks.
         * Calling System.exit() there would terminate the JVM before the
         * finally block runs, so the TET object would never be deleted.
         */
        int exitStatus = 0;
        TET tet = null;
        try {
            if (argv.length != 1) {
                throw new Exception("usage: text_from_annotations <filename>");
            }

            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            int doc = tet.open_document(argv[0], DOC_OPTLIST);
            if (doc == -1) {
                throw new Exception(
                    "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            out.print("File name: " + tet.pcos_get_string(doc, "filename") + "\n");
            printAnnotations(tet, doc);
            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("TET exception occurred in text_from_annotations sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() + ": " + e.get_errmsg());
            exitStatus = 1;
        } catch (Exception e) {
            System.err.println(e);
            exitStatus = 1;
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }

        if (exitStatus != 0) {
            System.exit(exitStatus);
        }
    }

    /**
     * Loops over all pages of the document and prints every annotation that
     * is not a Widget and has a non-empty string in its /Contents entry.
     *
     * @param tet the TET object the document was opened with
     * @param doc the document handle returned by open_document()
     * @throws TETException if one of the pCOS queries fails
     */
    private static void printAnnotations(TET tet, int doc) throws TETException {
        /* The number format does not depend on the annotation; build it once. */
        DecimalFormat rectFormat = new DecimalFormat();
        rectFormat.setMinimumFractionDigits(0);
        rectFormat.setMaximumFractionDigits(2);
        rectFormat.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));

        int pagecount = (int) tet.pcos_get_number(doc, "length:pages");

        /* Loop over all pages */
        for (int page = 0; page < pagecount; page++) {
            String base_path = "pages[" + page + "]/annots";

            /* Get number of annotations on this page */
            int anncount = (int) tet.pcos_get_number(doc, "length:" + base_path);

            for (int ann = 0; ann < anncount; ann++) {
                /* pCOS path for the next annotation: "pages[n]/annots[m]" */
                String annotation_path = base_path + "[" + ann + "]";

                String subtype = tet.pcos_get_string(doc, annotation_path + "/Subtype");

                /* Ignore form fields (=Widgets) */
                if (subtype.equals("Widget")) {
                    continue;
                }

                /* Ignore annotations without any /Contents entry or empty Contents */
                String objtype = tet.pcos_get_string(doc, "type:" + annotation_path + "/Contents");
                if (!objtype.equals("string")) {
                    continue;
                }
                String contents = tet.pcos_get_string(doc, annotation_path + "/Contents");
                if (contents.length() == 0) {
                    continue;
                }

                /*
                 * Print the type of the annotation. You can use it to filter
                 * out unwanted annotation types. For example, annotation
                 * type "FreeText" is the only type which places text
                 * directly on the page.
                 */
                out.print("page " + (page + 1) + ", annotation type: " + subtype + ", ");

                /* Print the rectangle for the annotation. */
                out.print("rectangle: ");
                printRectangle(tet, doc, annotation_path + "/Rect", rectFormat);

                /*
                 * Print contents of the annotation, i.e. the actual text.
                 * Text in PDF annotations uses U+000D as paragraph separator
                 * which is impractical in many environments, therefore we
                 * replace it.
                 */
                contents = contents.replace("\r", SEPARATOR);
                out.print("contents: '" + contents + "'\n");
            }
        }
    }

    /**
     * Prints the four numbers of an annotation rectangle as "[a b c d]"
     * followed by a newline, or "(not available)" if the object at the given
     * path is not an array of exactly four elements.
     *
     * @param tet the TET object the document was opened with
     * @param doc the document handle returned by open_document()
     * @param rect_path pCOS path of the annotation's /Rect entry
     * @param format number format applied to each rectangle value
     * @throws TETException if one of the pCOS queries fails
     */
    private static void printRectangle(TET tet, int doc, String rect_path, DecimalFormat format)
            throws TETException {
        boolean isFourElementArray = tet.pcos_get_string(doc, "type:" + rect_path).equals("array")
            && (int) tet.pcos_get_number(doc, "length:" + rect_path) == 4;

        if (!isFourElementArray) {
            out.print("(not available)\n");
            return;
        }

        out.print("[");
        for (int i = 0; i < 4; i += 1) {
            if (i > 0) {
                out.print(" ");
            }
            out.print(format.format(tet.pcos_get_number(doc, rect_path + "[" + i + "]")));
        }
        out.print("]\n");
    }
}
(Oct 16, 2019 - Oct 19, 2022)