PDFlib TET サンプル集（クックブック）

本サンプルプログラムは、PDF テキスト抽出ライブラリーの実装である TET の基本的な機能を実際のプログラムで紹介したものです。

本サイトでダウンロードした TET は、一部機能の制限を除き、評価版として無償でお使いいただけます。

イメージ抽出

PDFlib TET により PDF イメージを抽出するサンプルプログラムです。

必要な製品：PDFlib TET


import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import com.pdflib.TETException;
import com.pdflib.TET;

/**
 * PDF image extractor based on PDFlib TET.
 * <p>
 * Required software: TET 3
 * <p>
 * Required data: PDF document
 * 
 * @version $Id: image_extractor.java,v 1.6 2008/12/15 10:50:54 stm Exp $
 */
public class image_extractor
{
    /**
     * Global option list; sets the TET search path to the resource and
     * input directories relative to the working directory.
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
        + "../resource/glyphlist ../input}";
    
    /**
     * Document-specific option list (empty: no document options are passed).
     */
    static final String DOC_OPTLIST = "";
    
    /**
     * Page-specific option list
     */
    static final String PAGE_OPTLIST = "granularity=page";
    
    /**
     * Basic image extract options; the per-image "filename" option is
     * appended to this list for every extracted image.
     */
    static final String BASE_IMAGE_OPTLIST = "compression=auto format=auto";

    /**
     * The encoding in which the output is sent to System.out. For running the
     * example in a Windows command window, you can set this for example to
     * "windows-1252" for getting Latin-1 output.
     */
    private static final String OUTPUT_ENCODING = System
            .getProperty("file.encoding");

    /**
     * For printing to System.out in the encoding specified via OUTPUT_ENCODING.
     */
    private static PrintStream out;
    
    /**
     * Program entry point: opens the PDF named by the single command-line
     * argument and writes every image found on every page to a disk file
     * named {@code <input>_p<page>_<index>}.
     * <p>
     * Usage errors, document-open failures and TET exceptions are reported
     * on System.err; the TET object is always deleted before returning.
     * 
     * @param argv
     *            Command-line arguments; exactly one is expected, the name
     *            of the PDF document
     * @throws UnsupportedEncodingException
     *             if OUTPUT_ENCODING is not supported by the runtime
     */
    public static void main(String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        TET tet = null;

        try {
            if (argv.length != 1) {
                throw new Exception("usage: image_extractor <filename>");
            }

            /* The input file name doubles as the prefix of the image names */
            String outfilebase = argv[0];

            tet = new TET();

            tet.set_option(GLOBAL_OPTLIST);

            int doc = tet.open_document(argv[0], DOC_OPTLIST);
            if (doc == -1) {
                /* Note the blank before "in": number and text must not run together */
                throw new Exception("Error " + tet.get_errnum() + " in "
                        + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            /* get number of pages in the document */
            int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /* loop over pages (page numbers are 1-based) */
            for (int pageno = 1; pageno <= n_pages; ++pageno) {
                extract_page_images(tet, doc, pageno, outfilebase);
            }

            tet.close_document(doc);
        }
        catch (TETException e) {
            System.err.println("TET exception occurred in extractor sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname()
                    + ": " + e.get_errmsg());
        }
        catch (Exception e) {
            System.err.println(e.getMessage());
        }
        finally {
            if (tet != null) {
                tet.delete();
            }
        }
    }

    /**
     * Extract all images of a single page and write each one to a disk file.
     * A page that cannot be opened is reported and skipped.
     * 
     * @param tet
     *            The TET object
     * @param doc
     *            The document handle returned by open_document()
     * @param pageno
     *            The number of the page to process
     * @param outfilebase
     *            Prefix used for generating the image file names
     * @throws TETException
     *             if one of the TET calls throws
     */
    private static void extract_page_images(TET tet, int doc, int pageno,
            String outfilebase) throws TETException {
        int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

        if (page < 0) {
            print_tet_error(tet, pageno);
            return; /* caller continues with the next page */
        }

        /*
         * Retrieve all images on the page; the loop runs as long as
         * get_image_info() returns 1. Images are numbered from 0 per page.
         */
        for (int imageno = 0; tet.get_image_info(page) == 1; imageno++) {
            /*
             * Use the name of the input file and generate image
             * names from it.
             */
            String imagename = outfilebase + "_p" + pageno
                                            + "_" + imageno;
            String imageoptlist = BASE_IMAGE_OPTLIST 
                + " filename={" + imagename + "}";

            out.println("Extracting image " + imagename);
            
            /*
             * Fetch the image data and write it to a disk file.
             * NOTE(review): tet.imageid is presumably set by the preceding
             * get_image_info() call - confirm against the TET API reference.
             */
            if (tet.write_image_file(doc, tet.imageid,
                            imageoptlist) == -1) {
                print_tet_error(tet, pageno);
            }
        }

        /* Report an error that is still pending after the image loop ended */
        if (tet.get_errnum() != 0) {
            print_tet_error(tet, pageno);
        }

        tet.close_page(page);
    }

    /**
     * Report a TET error on System.err, using the error number, API name
     * and error message currently held by the TET object.
     * 
     * @param tet
     *            The TET object
     * @param pageno
     *            The page number on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno) {
        System.err.println("Error " + tet.get_errnum() + " in  "
                + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
    }
}

(May 6, 2010 - Oct 25, 2022)