PDFlib TET サンプル集(クックブック)
PDFlib TET で、文書から全てのユニークな単語を抽出して、件数と共にソートされたリストを作成するサンプルプログラムです。
package com.pdflib.cookbook.tet.text;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import com.pdflib.TET;
import com.pdflib.TETException;
/**
 * Create a sorted list of unique words in a document along with counts.
 *
 * Required software: TET 3
 *
 * Required data: PDF document
 *
 * Usage: {@code concordance <infilename>}. The words are fetched page by page
 * with word granularity, counted in a map, and finally printed in descending
 * order of their number of occurrences.
 */
class concordance {
    /**
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " +
        "../resource/glyphlist ../input}";

    /**
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=word";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     * Assigned once in {@link #main(String[])} before any output is produced.
     */
    private static PrintStream out;

    /**
     * Set this to true if all words are to be lowercased.
     */
    private static final boolean LOWERCASE_WORDS = false;

    /**
     * The name of the file to process.
     */
    private final String filename;

    /**
     * The map to store the per-word counters. The key is the word, the value is the
     * number of occurrences of the word.
     */
    private final Map<String, Integer> wordCounters = new HashMap<String, Integer>();

    /**
     * Write the error state currently held by the TET object to System.err.
     *
     * @param tet TET object whose error number, API name and message are reported
     *
     * @throws TETException An error occurred in the TET API
     */
    private static void report_error(TET tet) throws TETException {
        System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() +
            "(): " + tet.get_errmsg());
    }

    /**
     * Process a single page of text: every fragment that starts with a letter is
     * counted in {@link #wordCounters}.
     *
     * @param tet TET object
     * @param doc TET document handle
     * @param pageno Page to process
     *
     * @throws TETException An error occurred in the TET API
     */
    private void process_page(TET tet, final int doc, int pageno) throws TETException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
        if (page == -1) {
            report_error(tet);
            return;
        }

        /*
         * Fetch the text word-wise until get_text() signals the end with null.
         */
        for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
            /*
             * Only include words that start with a letter. The length check is
             * defensive: charAt(0) would throw on an empty fragment.
             */
            if (text.length() == 0 || !Character.isLetter(text.charAt(0))) {
                continue;
            }

            if (LOWERCASE_WORDS) {
                text = text.toLowerCase();
            }

            // First occurrence starts at 1, every further one increments.
            final Integer previous = wordCounters.get(text);
            final int count = (previous == null) ? 1 : previous.intValue() + 1;
            wordCounters.put(text, Integer.valueOf(count));
        }

        // A non-zero error number after the loop is reported, not thrown.
        if (tet.get_errnum() != 0) {
            report_error(tet);
        }

        tet.close_page(page);
    }

    /**
     * Print out the results: one line per word with its count, ordered by
     * descending count, followed by the number of unique words.
     *
     * @param tet TET object (not used by this method)
     * @param doc TET document handle (not used by this method)
     *
     * @throws TETException declared for callers; nothing in the body throws it
     */
    private void print_concordance(TET tet, int doc) throws TETException {
        out.println("List of words in the document \"" + filename +
            "\" along with the number of occurrences:");
        out.println();

        /*
         * Sort the keys from the Map descending according to their count.
         */
        final String[] words = wordCounters.keySet().toArray(new String[wordCounters.size()]);
        Arrays.sort(words, new Comparator<String>() {
            public int compare(String w1, String w2) {
                // Reversed operands give the descending order.
                return wordCounters.get(w2).compareTo(wordCounters.get(w1));
            }
        });

        for (int i = 0; i < words.length; i += 1) {
            out.println(words[i] + " " + wordCounters.get(words[i]));
        }

        out.println();
        out.println("Total unique words: " + words.length);
    }

    /**
     * Generate the concordance for the given file.
     *
     * A failure to open the document is reported on System.err. A TETException
     * is reported on System.err (with the page number if page processing had
     * already started) and terminates the program with exit status 1.
     */
    private void execute() {
        TET tet = null;
        int pageno = 0;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final int doc = tet.open_document(filename, DOC_OPTLIST);
            if (doc == -1) {
                report_error(tet);
            } else {
                /*
                 * Loop over pages in the document
                 */
                final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
                for (pageno = 1; pageno <= n_pages; ++pageno) {
                    process_page(tet, doc, pageno);
                }

                print_concordance(tet, doc);
                tet.close_document(doc);
            }
        } catch (TETException e) {
            // pageno is still 0 if the failure happened before the page loop.
            if (pageno == 0) {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() +
                    "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() +
                    "() on page " + pageno + ": " + e.get_errmsg() + "\n");
            }
            System.exit(1);
        } finally {
            /*
             * tet is still null if the constructor itself failed (for example
             * with an error that is not a TETException). Without this guard the
             * resulting NullPointerException would hide the original failure.
             */
            if (tet != null) {
                tet.delete();
            }
        }
    }

    /**
     * @param filename the name of the file for which the concordance will be
     *                 generated
     */
    private concordance(String filename) {
        this.filename = filename;
    }

    /**
     * Program entry point. Expects exactly one argument, the name of the input
     * file; otherwise a usage message is printed and nothing is processed.
     *
     * @param args command line arguments
     *
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not supported
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 1) {
            out.println("usage: concordance <infilename>");
            return;
        }

        concordance c = new concordance(args[0]);
        c.execute();
    }
}
(May 6, 2010 - Oct 16, 2019)