PDFlib

高度なPDFアプリケーションの開発を支援する定番プログラムライブラリー Supported by インフォテック株式会社

PDFlib TET サンプル集(クックブック)

本サンプルプログラムは、PDF テキスト抽出ライブラリーの実装である TET の基本的な機能を実際のプログラムで紹介したものです。

本サイトでダウンロードした TET は、一部機能の制限を除き、評価版として無償でお使いいただけます。

PDFlib TET で、文書から全ての単語を抽出して索引を作成するサンプルプログラムです。

索引はアルファベット順でソートされ、単語が出現するページ番号を一覧表示します。

A~Z および a~z で始まる単語に限定してインデックスしていることに注意してください。(デモプログラムは国際化されたインデックスプログラムを適切に作成するのに十分な機能を持ってはいません)

必要な製品: PDFlib TET


/*
 * Create a sorted list of all words in the document along with the page numbers
 * where the words occur.
 * 
 * Note that the index is limited to words starting with characters [A-Za-z], as
 * the demonstration program lacks the features that would be necessary for
 * making it a truly internationalized index program.
 * 
 * Required software: TET 5
 * 
 * Required data: PDF document
 * 
 */
package com.pdflib.cookbook.tet.text;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.text.Collator;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import com.pdflib.TET;
import com.pdflib.TETException;


class back_of_the_book_index {
    /**
     * Global option list. The program expects the "resource" directory parallel to
     * the "java" directory.
     */
    private static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap " + "../resource/glyphlist ../input}";

    /**
     * Document-specific option list.
     */
    private static final String DOC_OPTLIST = "";

    /**
     * Page-specific option list: fetch the page contents word by word.
     */
    private static final String PAGE_OPTLIST = "granularity=word";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System.getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;

    /**
     * A word must start with one of the characters in this string to be included in
     * the index (case doesn't matter).
     */
    private static final String INCLUDE_CHARS = "abcdefghijklmnopqrstuvwxyz";

    /**
     * Set this to true if all words are to be lowercased.
     */
    private static final boolean LOWERCASE_WORDS = false;

    /**
     * The name of the file to process.
     */
    private final String filename;

    /**
     * A map of sets. The map key is the word, the value is a set of page numbers.
     * For the page set a LinkedHashSet is used, as we traverse the document in page
     * order, and the LinkedHashSet preserves the insertion order, which will give
     * us the desired sorted list of page numbers.
     */
    private final Map<String, Set<Integer>> wordPages = new HashMap<String, Set<Integer>>();

    /**
     * Process a single page of text: collect every word that starts with a
     * character from INCLUDE_CHARS, recording the page number it occurs on.
     *
     * @param tet    TET object
     * @param doc    TET document handle
     * @param pageno Page to process (1-based)
     *
     * @throws TETException An error occurred in the TET API
     */
    private void process_page(TET tet, final int doc, int pageno) throws TETException {
        final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page == -1) {
            System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        } else {
            /*
             * Fetch the text word-wise; get_text returns null at end of page.
             */
            for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
                /*
                 * Only include words that start with a letter out of the set of interesting
                 * characters.
                 */
                if (INCLUDE_CHARS.indexOf(Character.toLowerCase(text.charAt(0))) != -1) {
                    if (LOWERCASE_WORDS) {
                        text = text.toLowerCase();
                    }

                    Set<Integer> pages = wordPages.get(text);
                    if (pages == null) {
                        pages = new LinkedHashSet<Integer>();
                        wordPages.put(text, pages);
                    }
                    pages.add(Integer.valueOf(pageno));
                }
            }

            if (tet.get_errnum() != 0) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            tet.close_page(page);
        }
    }

    /**
     * Print out the results: an alphabetically sorted word list, grouped by first
     * letter, each word followed by the pages it appears on.
     *
     * @param tet TET object (currently unused; kept for signature stability)
     * @param doc TET document handle (currently unused; kept for signature stability)
     *
     * @throws TETException An error occurred in the TET API
     */
    private void print_index(TET tet, int doc) throws TETException {
        out.println("Alphabetical list of words in the document \"" + filename + "\" along with their page number:");
        out.println();

        String[] words = new String[wordPages.size()];
        words = wordPages.keySet().toArray(words);

        /*
         * Sort according to the sorting rules of the default locale. Collator
         * itself implements Comparator<Object>, so no adapter class is needed.
         */
        final Collator collator = Collator.getInstance();
        Arrays.sort(words, collator);

        char currentGroup = 0;

        /*
         * Print out the words with the pages they appear on, grouped by first letter.
         */
        for (int i = 0; i < words.length; i += 1) {
            String word = words[i];
            char firstChar = Character.toUpperCase(word.charAt(0));

            // Emit a group heading whenever the (uppercased) first letter changes.
            if (firstChar != currentGroup) {
                out.println(firstChar);
                currentGroup = firstChar;
            }

            out.print(word + " ");

            // The LinkedHashSet already holds the pages in ascending order.
            Set<Integer> pages = wordPages.get(word);
            Iterator<Integer> j = pages.iterator();
            boolean first = true;
            while (j.hasNext()) {
                if (!first) {
                    out.print(", ");
                } else {
                    first = false;
                }
                out.print(j.next());
            }
            out.println();
        }
    }

    /**
     * Generate the index for the given file: open the document, process all
     * pages, print the index, and clean up the TET object.
     */
    private void execute() {
        TET tet = null;
        int pageno = 0;

        try {
            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            final int doc = tet.open_document(filename, DOC_OPTLIST);
            if (doc == -1) {
                System.err
                        .println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            } else {
                /*
                 * Loop over pages in the document
                 */
                final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
                for (pageno = 1; pageno <= n_pages; ++pageno) {
                    process_page(tet, doc, pageno);
                }

                print_index(tet, doc);

                tet.close_document(doc);
            }
        } catch (TETException e) {
            // pageno == 0 means the failure happened before page processing started.
            if (pageno == 0) {
                System.err
                        .println("Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
            } else {
                System.err.println("Error " + e.get_errnum() + " in " + e.get_apiname() + "() on page " + pageno + ": "
                        + e.get_errmsg() + "\n");
            }
            System.exit(1);
        } finally {
            /*
             * Guard against tet being null: if the TET constructor threw a
             * non-TETException, an unguarded delete() would raise an NPE here
             * and mask the original exception.
             */
            if (tet != null) {
                tet.delete();
            }
        }
    }

    /**
     * @param filename the name of the file for which the concordance will be
     *                 generated
     */
    private back_of_the_book_index(String filename) {
        this.filename = filename;
    }

    public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        if (args.length != 1) {
            out.println("usage: back_of_the_book_index <infilename>");
            return;
        }

        back_of_the_book_index c = new back_of_the_book_index(args[0]);
        c.execute();
    }
}

(May 6, 2010 - Oct 19, 2022)