package com.pdflib.cookbook.tet.text; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.Map; import com.pdflib.TET; import com.pdflib.TETException; /** * Create a sorted list of unique words in a document along with counts. * * Required software: TET 3 *

* Required data: PDF document
 *
 */
class concordance {
    /**
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
            + "../resource/glyphlist ../input}";

    /**
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=word";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * Set this to true if all words are to be lowercased.
     */
    private static final boolean LOWERCASE_WORDS = false;

    /**
     * The name of the file to process.
     */
    private String filename;

    /**
     * The map to store the per-word counters. The key is the word, the value is the
     * number of occurrences of the word.
     */
    private Map wordCounters = new HashMap();

    /**
     * Process a single page of text: open the page with PAGE_OPTLIST, fetch
     * its text fragment by fragment and count every fragment that starts with
     * a letter in {@link #wordCounters}.
     *
     * If the page cannot be opened (open_page returns -1), or if TET reports a
     * non-zero error number after the text loop, the error is printed to
     * System.err and the method returns normally.
     *
     * @param tet    TET object
     * @param doc    TET document handle
     * @param pageno Page to process
     *
     * @throws TETException An error occurred in the TET API
     */
    private void process_page(TET tet, final int doc, int pageno) throws TETException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
        } else {
            /*
             * Fetch the text word-wise; the loop ends when get_text returns null.
             */
            for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
                /*
                 * Only include words that start with a letter. The first
                 * character is examined as a full code point so that a letter
                 * encoded as a surrogate pair is recognized, and empty
                 * fragments are skipped instead of raising an
                 * IndexOutOfBoundsException.
                 */
                if (text.length() > 0 && Character.isLetter(text.codePointAt(0))) {
                    if (LOWERCASE_WORDS) {
                        text = text.toLowerCase();
                    }

                    Integer value = (Integer) wordCounters.get(text);
                    if (value != null) {
                        // Increment counter
                        value = Integer.valueOf(value.intValue() + 1);
                    } else {
                        // Initialize with first counted word
                        value = Integer.valueOf(1);
                    }
                    wordCounters.put(text, value);
                }
            }

            /*
             * A non-zero error number at this point is reported as an error
             * on System.err.
             */
            if (tet.get_errnum() != 0) {
                System.err
                        .println("Error " + tet.get_errnum() + " in "
                                + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            tet.close_page(page);
        }
    }

    /**
     * Print out the results.
     *
     * @throws TETException
     */
    private void print_concordance(TET tet, int doc) throws TETException {
        out.println("List of words in the document \"" + filename
                + "\" along with the number of occurrences:");
        out.println();

        /*
         * Sort the key-value pairs from the Map descending according to their count.
         */
        String[] words = new String[wordCounters.size()];
        words = (String[]) wordCounters.keySet().toArray(words);
        Arrays.sort(words, new Comparator