import java.io.ByteArrayInputStream; import java.io.PrintStream; import java.io.UnsupportedEncodingException; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; import com.pdflib.TET; import com.pdflib.TETException; /** * Extract text from PDF document as XML. If an output filename is specified, * write the XML to the output file. Otherwise fetch the XML in memory, parse it * and print some information to System.out. *
* Required software: TET 3 *
* Required data: PDF document
*
* @version $Id: tetml.java,v 1.1 2008/11/21 07:18:51 stm Exp $
*/
public class tetml {
/**
* Global option list. Sets the <code>searchpath</code> option to three
* relative directories: <code>../resource/cmap</code>,
* <code>../resource/glyphlist</code> and <code>../input</code>.
* NOTE(review): presumably resolved against the working directory the
* example is started from -- confirm against the TET documentation.
*/
static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
+ "../resource/glyphlist ../input}";
/**
* Document-specific option list. Empty here, i.e. this constant itself
* contributes no document options.
* NOTE(review): the "BASE" prefix suggests that further options are appended
* where the document is opened; that code is not part of this section.
*/
static final String BASE_DOC_OPTLIST = "";
/**
* Page-specific option list. Selects <code>granularity=word</code>.
* NOTE(review): presumably this is what yields the "Word" elements that the
* SAX handler below counts -- confirm against the TET documentation.
*/
static final String PAGE_OPTLIST = "granularity=word";
/**
* The encoding in which the output is sent to System.out. Initialized from
* the <code>file.encoding</code> system property. For running the example
* in a Windows command window, you can set this for example to
* "windows-1252" for getting Latin-1 output.
*/
private static final String OUTPUT_ENCODING = System
.getProperty("file.encoding");
/**
* For printing to System.out in the encoding specified via OUTPUT_ENCODING.
* Not initialized at declaration; main() assigns it before doing anything
* else with it. The SAX handler prints its font information through this
* stream.
*/
private static PrintStream out;
/**
* Set to true for in-memory processing; currently hard-wired to
* <code>true</code>.
* NOTE(review): the code that evaluates this flag is outside this section.
*/
private static final boolean INMEMORY = true;
/**
* Word counter for in-memory processing code. Incremented by the SAX
* handler once per "Word" start tag. This is an instance field, so each
* tetml object keeps its own count.
*/
int word_count = 0;
/**
 * SAX handler that inspects the start tag of every element in the parsed
 * XML. "Word" elements are tallied in the enclosing instance's
 * <code>word_count</code>; "Font" elements are reported on the shared
 * <code>out</code> stream together with their "name" and "type" attributes.
 * All other elements are ignored.
 */
private class sax_handler extends DefaultHandler {
    /**
     * Handles the opening tag of an element.
     *
     * @param uri            namespace URI of the element (not used)
     * @param local_name     local name of the element; selects the action
     * @param qualified_name qualified name of the element (not used)
     * @param attributes     attributes attached to the element; only read
     *                       for "Font" elements
     * @throws SAXException  declared by the overridden method; this
     *                       implementation does not throw it itself
     */
    @Override
    public void startElement(String uri, String local_name,
            String qualified_name, Attributes attributes)
            throws SAXException {
        // Words are only counted, nothing is printed for them.
        if (local_name.equals("Word")) {
            word_count++;
            return;
        }

        // Anything that is neither a word nor a font is of no interest.
        if (!local_name.equals("Font")) {
            return;
        }

        // Attributes are looked up without a namespace; a missing attribute
        // yields null and is printed as the text "null".
        final String fontName = attributes.getValue("", "name");
        final String fontType = attributes.getValue("", "type");
        out.println("Font " + fontName + " (" + fontType + ")");
    }
}
public static void main(String[] args) throws UnsupportedEncodingException {
System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
out = new PrintStream(System.out, true, OUTPUT_ENCODING);
if (args.length != 1) {
System.err.println("usage: tetml