/*
 * Create a sorted list of all words in the document along with the page numbers
 * where the words occur.
 *
 * Note that the index is limited to words starting with characters [A-Za-z], as
 * the demonstration program lacks the features that would be necessary for
 * making it a truly internationalized index program.
 *
 * Required software: TET 5
 *
 * Required data: PDF document
 *
 */
package com.pdflib.cookbook.tet.text;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.Collator;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import com.pdflib.TET;
import com.pdflib.TETException;

/*
 * Builds a list of the words found in a document together with the numbers of
 * the pages on which each word occurs.
 */
class back_of_the_book_index {
    /*
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
            + "../resource/glyphlist ../input}";

    /*
     * Document specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list.
     */
    private static final String PAGE_OPTLIST = "granularity=word";

    /*
     * The encoding in which the output is sent to System.out. It is taken from
     * the "file.encoding" system property. For running the example in a Windows
     * command window, you can set this for example to "windows-1252" for getting
     * Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /*
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /*
     * A word must start with one of the characters in this string to be included in
     * the index (case doesn't matter).
     */
    private static final String INCLUDE_CHARS = "abcdefghijklmnopqrstuvwxyz";

    /*
     * Set this to true if all words are to be lowercased.
     */
    private static final boolean LOWERCASE_WORDS = false;

    /*
     * The name of the file to process.
     */
    private String filename;

    /*
     * A map of sets.
* The map key is the word, the value is the set of page numbers on which
     * the word occurs. For the page set a LinkedHashSet is used, as we traverse
     * the document in page order, and the LinkedHashSet preserves the insertion
     * order, which will give us the desired sorted list of page numbers.
     */
    private Map<String, Set<Integer>> wordPages = new HashMap<String, Set<Integer>>();

    /*
     * Process a single page of text: fetch the page's text word by word and
     * record the page number for every word that qualifies for the index.
     *
     * @param tet TET object
     * @param doc TET document handle
     * @param pageno Page to process
     *
     * @throws TETException An error occurred in the TET API
     */
    private void process_page(TET tet, final int doc, int pageno) throws TETException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        /* The page could not be opened: report the problem and skip the page. */
        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            return;
        }

        /* Box the page number once instead of once per word. */
        final Integer pageNumber = Integer.valueOf(pageno);

        /*
         * Fetch the text word-wise until get_text() delivers null.
         */
        for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
            /*
             * Only include words that start with a letter out of the set of
             * interesting characters. The length check keeps charAt(0) from
             * throwing should an empty string ever be delivered.
             */
            if (text.length() == 0
                    || INCLUDE_CHARS.indexOf(Character.toLowerCase(text.charAt(0))) == -1) {
                continue;
            }

            if (LOWERCASE_WORDS) {
                text = text.toLowerCase();
            }

            /* Create the page set on the first occurrence of the word. */
            Set<Integer> pages = wordPages.get(text);
            if (pages == null) {
                pages = new LinkedHashSet<Integer>();
                wordPages.put(text, pages);
            }
            pages.add(pageNumber);
        }

        /* Report an error that was recorded while the text was fetched. */
        if (tet.get_errnum() != 0) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        }

        tet.close_page(page);
    }

    /*
     * Print out the results.
     *
     * @throws TETException
     */
    private void print_index(TET tet, int doc) throws TETException {
        out.println("Alphabetical list of words in the document \"" + filename + "\" along with their page number:");
        out.println();

        String[] words = new String[wordPages.size()];
        words = wordPages.keySet().toArray(words);

        /*
         * Sort according to the sorting rules of the default locale.
         */
        final Collator collator = Collator.getInstance();
        Arrays.sort(words, new Comparator