import java.io.ByteArrayInputStream; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; import com.pdflib.TET; import com.pdflib.TETException; /** * Extract text from PDF document as XML. If an output filename is specified, * write the XML to the output file. Otherwise fetch the XML in memory, parse it * and print some information to System.out. *

* Required software: TET 3 *

* Required data: PDF document * * @version $Id: tetml.java,v 1.1 2008/11/21 07:18:51 stm Exp $ */ public class tetml { /** * Global option list. */ static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}"; /** * Document specific option list. */ static final String BASE_DOC_OPTLIST = ""; /** * Page-specific option list. */ static final String PAGE_OPTLIST = "granularity=word"; /** * The encoding in which the output is sent to System.out. For running the * example in a Windows command window, you can set this for example to * "windows-1252" for getting Latin-1 output. */ private static final String OUTPUT_ENCODING = System .getProperty("file.encoding"); /** * For printing to System.out in the encoding specified via OUTPUT_ENCODING. */ private static PrintStream out; /** * Set to true for in-memory processing. */ private static final boolean INMEMORY = true; /** * Word counter for in-memory processing code. */ int word_count = 0; /** * SAX handler class to count the words in the document. */ private class sax_handler extends DefaultHandler { public void startElement(String uri, String local_name, String qualified_name, Attributes attributes) throws SAXException { if (local_name.equals("Word")) { word_count += 1; } else if (local_name.equals("Font")) { out.println("Font " + attributes.getValue("", "name") + " (" + attributes.getValue("", "type") + ")"); } } } public static void main(String[] args) throws UnsupportedEncodingException { System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\""); out = new PrintStream(System.out, true, OUTPUT_ENCODING); if (args.length != 1) { System.err.println("usage: tetml "); return; } /* * For JRE 1.4 the property must be set what XML parser to use, later * JREs seem to have a default set internally. It seems to be the case * that in 1.4 org.apache.crimson.parser.XMLReaderImpl is always * available. */ String jre_version = System.getProperty("java.version"); if (jre_version.startsWith("1.4")) { System.setProperty("org.xml.sax.driver", "org.apache.crimson.parser.XMLReaderImpl"); } /* * We need a tetml object, otherwise it's not possible to set up the * handler for the SAX parser with the local sax_handler class. */ tetml t = new tetml(); t.process_xml(args); } private void process_xml(String[] args) { TET tet = null; try { tet = new TET(); tet.set_option(GLOBAL_OPTLIST); final String outputfilename = args[0] + ".tetml"; final String docoptlist = (INMEMORY ? "tetml={}" : "tetml={filename={" + outputfilename + "}}") + " " + BASE_DOC_OPTLIST; if (INMEMORY) { out.println("Processing TETML output for document \"" + args[0] + "\" in memory..."); } else { out.println("Extracting TETML for document \"" + args[0] + "\" to file \"" + outputfilename + "\"..."); } final int doc = tet.open_document(args[0], docoptlist); if (doc == -1) { System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg()); tet.delete(); return; } final int n_pages = (int) tet.pcos_get_number(doc, "length:pages"); /* * Loop over pages in the document; */ for (int pageno = 0; pageno <= n_pages; ++pageno) { tet.process_page(doc, pageno, PAGE_OPTLIST); } /* * This could be combined with the last page-related call. */ tet.process_page(doc, 0, "tetml={trailer}"); if (INMEMORY) { /* * Get the XML document as a byte array. */ final byte[] tetml = tet.get_xml_data(doc, ""); if (tetml == null) { System.err.println("tetml: couldn't retrieve XML data"); return; } /* * Process the in-memory XML document to print out some * information that is extracted with the sax_handler class. */ XMLReader reader = XMLReaderFactory.createXMLReader(); reader.setContentHandler(new sax_handler()); reader.parse(new InputSource(new ByteArrayInputStream(tetml))); out.println("Found " + word_count + " words in document"); } tet.close_document(doc); } catch (TETException e) { System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg()); } catch (Exception e) { e.printStackTrace(); } finally { if (tet != null) { tet.delete(); } } } }