PDFlib

高度なPDFアプリケーションの開発を支援する定番プログラムライブラリー Supported by インフォテック株式会社

PDFlib TET サンプル集(クックブック)

本サンプルプログラムは、PDF テキスト抽出ライブラリーの実装である TET の基本的な機能を実際のプログラムで紹介したものです。

本サイトでダウンロードした TET は、一部機能の制限を除き、評価版として無償でお使いいただけます。

PDFlib TET で、PDF 文書からテキストを XML として抽出するサンプルプログラムです。

出力ファイル名が指定されている場合は、出力ファイルに XML を書き出します。その他の場合には、メモリ上に XML を抽出した後、これを処理して情報を標準出力 System.out に出力します。


import java.io.ByteArrayInputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import com.pdflib.TET;
import com.pdflib.TETException;

/**
 * Extract text from PDF document as XML (TETML). Depending on the
 * compile-time switch {@link #INMEMORY}, the TETML is either written to the
 * file "&lt;pdffilename&gt;.tetml", or it is fetched in memory, parsed with a
 * SAX parser, and some information (fonts, word count) is printed to the
 * output stream.
 * <p>
 * Required software: TET 3
 * <p>
 * Required data: PDF document
 * 
 * @version $Id: tetml.java,v 1.1 2008/11/21 07:18:51 stm Exp $
 */
public class tetml {
    /**
     * Global option list.
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
        + "../resource/glyphlist ../input}";

    /**
     * Document specific option list.
     */
    static final String BASE_DOC_OPTLIST = "";

    /**
     * Page-specific option list.
     */
    static final String PAGE_OPTLIST = "granularity=word";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System
            .getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;
    
    /**
     * Set to true for in-memory processing.
     */
    private static final boolean INMEMORY = true;
    
    /**
     * Word counter for in-memory processing code.
     */
    int word_count = 0;
    
    /**
     * SAX handler class to count the words in the document. Every "Word"
     * element increments the word counter of the enclosing tetml object;
     * for every "Font" element the "name" and "type" attributes are printed.
     */
    private class sax_handler extends DefaultHandler {
        public void startElement(String uri, String local_name,
                String qualified_name, Attributes attributes)
                throws SAXException {
            if (local_name.equals("Word")) {
                word_count += 1;
            }
            else if (local_name.equals("Font")) {
                out.println("Font " + attributes.getValue("", "name")
                        + " (" + attributes.getValue("", "type") + ")");
            }
        }
    }
    
    /**
     * Program entry point. Sets up the output stream, checks the command
     * line and runs the extraction for the given PDF document.
     * 
     * @param args
     *            exactly one element: the name of the PDF input file
     * @throws UnsupportedEncodingException
     *             if OUTPUT_ENCODING is not supported by the PrintStream
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 1) {
            System.err.println("usage: tetml <pdffilename>");
            return;
        }

        /*
         * For JRE 1.4 the property must be set what XML parser to use, later
         * JREs seem to have a default set internally. It seems to be the case
         * that in 1.4 org.apache.crimson.parser.XMLReaderImpl is always
         * available.
         */
        String jre_version = System.getProperty("java.version");
        if (jre_version.startsWith("1.4")) {
            System.setProperty("org.xml.sax.driver",
                    "org.apache.crimson.parser.XMLReaderImpl");
        }

        /*
         * We need a tetml object, otherwise it's not possible to set up the
         * handler for the SAX parser with the local sax_handler class.
         */
        tetml t = new tetml();
        t.process_xml(args);
    }

    /**
     * Open the PDF document named by args[0], generate TETML for all of its
     * pages and, for in-memory processing, parse the resulting XML with the
     * sax_handler class. Errors are reported on System.err; the TET object is
     * always deleted in the finally clause.
     * 
     * @param args
     *            command line arguments; args[0] is the PDF file name
     */
    private void process_xml(String[] args) {
        TET tet = null;
        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final String outputfilename = args[0] + ".tetml";
            
            final String docoptlist = (INMEMORY ? "tetml={}"
                    : "tetml={filename={" + outputfilename + "}}")
                    + " " + BASE_DOC_OPTLIST;

            if (INMEMORY) {
                out.println("Processing TETML output for document \""
                        + args[0] + "\" in memory...");
            }
            else {
                out.println("Extracting TETML for document \"" + args[0] 
                          + "\" to file \"" + outputfilename + "\"...");
            }

            final int doc = tet.open_document(args[0], docoptlist);
            if (doc == -1) {
                System.err.println("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "(): " + tet.get_errmsg());
                /*
                 * No explicit tet.delete() here: the finally clause below
                 * deletes the TET object exactly once on every exit path.
                 */
                return;
            }

            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /*
             * Loop over pages in the document. Page numbers are 1-based;
             * page number 0 is only used for the trailer call below.
             */
            for (int pageno = 1; pageno <= n_pages; ++pageno) {
                tet.process_page(doc, pageno, PAGE_OPTLIST);
            }

            /*
             * This could be combined with the last page-related call.
             */
            tet.process_page(doc, 0, "tetml={trailer}");

            if (INMEMORY) {
                /*
                 * Get the XML document as a byte array.
                 */
                final byte[] tetml = tet.get_xml_data(doc, "");

                if (tetml == null) {
                    System.err.println("tetml: couldn't retrieve XML data");
                    return;
                }

                /*
                 * Process the in-memory XML document to print out some
                 * information that is extracted with the sax_handler class.
                 */

                XMLReader reader = XMLReaderFactory.createXMLReader();
                reader.setContentHandler(new sax_handler());
                reader.parse(new InputSource(new ByteArrayInputStream(tetml)));
                out.println("Found " + word_count + " words in document");
            }

            tet.close_document(doc);
        }
        catch (TETException e) {
            System.err.println("Error " + e.get_errnum() + " in "
                    + e.get_apiname() + "(): " + e.get_errmsg());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            if (tet != null) {
                tet.delete();
            }
        }
    }
}
(May 6, 2010 - Feb 21, 2014)