PDFlib

高度なPDFアプリケーションの開発を支援する定番プログラムライブラリー Supported by インフォテック株式会社

PDFlib TET サンプル集(クックブック)

本サンプルプログラムは、PDF テキスト抽出ライブラリーの実装である TET の基本的な機能を実際のプログラムで紹介したものです。

本サイトでダウンロードした TET は、一部機能の制限を除き、評価版として無償でお使いいただけます。

グリフ情報の抽出

PDFlib TET で、PDF 文書からグリフ情報を抽出するサンプルプログラムです。


/**
 * Simple PDF glyph dumper based on PDFlib TET
 *
 * Required software: TET 5.2 (if you define the constants TET.CT_SEQ_START
 * etc. the code can also be used with TET 5.0)
 * 
 * Required data: PDF document
 * 
 */

package com.pdflib.cookbook.tet.text;

import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Formatter;
import java.util.Locale;

import com.pdflib.TET;
import com.pdflib.TETException;


/**
 * Command-line sample which dumps, for every page of a PDF document, the
 * extracted text fragments followed by one line of details per glyph
 * (Unicode value, font name, size, position, color, and type/attribute
 * flags). Output is written UTF-8 encoded to standard output, errors are
 * reported on standard error.
 */
public class glyphinfo {
    /**
     * Global option list
     */
    static final String globaloptlist = "searchpath={{../input} {../resource/cmap}}";

    /**
     * Document-specific option list
     */
    static final String docoptlist = "";

    /**
     * Page-specific option list
     */
    static final String pageoptlist = "granularity=word";

    /**
     * Print a parenthesized description of the fill color identified by a color
     * id: pattern information, color space name, color-space specific details,
     * and the color component values.
     *
     * @param formatter The formatter which receives the output
     * @param tet       The TET object
     * @param doc       The document handle
     * @param colorid   The color id of the glyph to describe
     * @throws TETException if one of the TET/pCOS queries fails
     */
    private static void print_color_value(Formatter formatter, TET tet, int doc, int colorid) throws TETException {
        /*
         * We handle only the fill color, but ignore the stroke color. The stroke color
         * can be retrieved analogously with the keyword "stroke". The members
         * colorspaceid, patternid and components read below are expected to be
         * filled in by this call.
         */
        tet.get_color_info(doc, colorid, "usage=fill");

        if (tet.colorspaceid == -1 && tet.patternid == -1) {
            formatter.format(" (not filled)");
            return;
        }

        formatter.format(" (");

        if (tet.patternid != -1) {
            final String patternpath = "patterns[" + tet.patternid + "]";
            int patterntype = (int) tet.pcos_get_number(doc, patternpath + "/PatternType");

            if (patterntype == 1) { /* Tiling pattern */
                int painttype = (int) tet.pcos_get_number(doc, patternpath + "/PaintType");

                if (painttype == 1) {
                    formatter.format("colored Pattern)");
                    return;
                } else if (painttype == 2) {
                    formatter.format("uncolored Pattern, base color: ");
                    /* continue with the color space output below */
                }
            } else if (patterntype == 2) { /* Shading pattern */
                int shadingtype = (int) tet.pcos_get_number(doc, patternpath + "/Shading/ShadingType");

                formatter.format("shading Pattern, ShadingType=%d)", shadingtype);
                return;
            }
        }

        final String cspath = "colorspaces[" + tet.colorspaceid + "]";
        String csname = tet.pcos_get_string(doc, cspath + "/name");

        formatter.format("%s", csname);

        /* Emit more details depending on the colorspace type */
        if (csname.equals("ICCBased")) {
            int iccprofileid = (int) tet.pcos_get_number(doc, cspath + "/iccprofileid");
            final String profilepath = "iccprofiles[" + iccprofileid + "]";

            String errormessage = tet.pcos_get_string(doc, profilepath + "/errormessage");

            /*
             * A non-empty error message means that the embedded profile is
             * damaged: report the message instead of the profile details.
             */
            if (!errormessage.isEmpty()) {
                formatter.format(" (%s)", errormessage);
            } else {
                String profilename = tet.pcos_get_string(doc, profilepath + "/profilename");
                formatter.format(" '%s'", profilename);

                String profilecs = tet.pcos_get_string(doc, profilepath + "/profilecs");
                formatter.format(" '%s'", profilecs);
            }
        } else if (csname.equals("Separation")) {
            String colorantname = tet.pcos_get_string(doc, cspath + "/colorantname");
            formatter.format(" '%s'", colorantname);
        } else if (csname.equals("DeviceN")) {
            formatter.format(" ");

            /* One colorant name per color component, separated by slashes */
            for (int i = 0; i < tet.components.length; i++) {
                if (i > 0) {
                    formatter.format("/");
                }
                formatter.format("%s", tet.pcos_get_string(doc, cspath + "/colorantnames[" + i + "]"));
            }
        } else if (csname.equals("Indexed")) {
            int baseid = (int) tet.pcos_get_number(doc, cspath + "/baseid");

            /* Report the name of the base color space as well */
            formatter.format(" %s", tet.pcos_get_string(doc, "colorspaces[" + baseid + "]/name"));
        }

        print_components(formatter, tet);
    }

    /**
     * Print the slash-separated color component values followed by the closing
     * parenthesis of the color description.
     *
     * @param formatter The formatter which receives the output
     * @param tet       The TET object holding the current color components
     */
    private static void print_components(Formatter formatter, TET tet) {
        formatter.format(" ");
        for (int i = 0; i < tet.components.length; i++) {
            if (i > 0) {
                formatter.format("/");
            }
            formatter.format("%g", tet.components[i]);
        }
        formatter.format(")");
    }

    /**
     * Print the keywords describing the "type" and "attributes" members of the
     * glyph most recently fetched from the TET object.
     *
     * @param formatter The formatter which receives the output
     * @param tet       The TET object holding the current glyph details
     */
    private static void print_glyph_flags(Formatter formatter, TET tet) {
        /* Examine the "type" member */
        if (tet.type == TET.CT_SEQ_START) {
            formatter.format(" ligature_start");
        } else if (tet.type == TET.CT_SEQ_CONT) {
            formatter.format(" ligature_cont");
        } else if (tet.type == TET.CT_INSERTED) {
            /* Separators are only inserted for granularity > word */
            formatter.format(" inserted");
        }

        /* Examine the bit flags in the "attributes" member */
        if (tet.attributes != TET.ATTR_NONE) {
            if ((tet.attributes & TET.ATTR_SUB) != 0) {
                formatter.format("/sub");
            }
            if ((tet.attributes & TET.ATTR_SUP) != 0) {
                formatter.format("/sup");
            }
            if ((tet.attributes & TET.ATTR_DROPCAP) != 0) {
                formatter.format("/dropcap");
            }
            if ((tet.attributes & TET.ATTR_SHADOW) != 0) {
                formatter.format("/shadow");
            }
            if ((tet.attributes & TET.ATTR_DEHYPHENATION_PRE) != 0) {
                formatter.format("/dehyphenation_pre");
            }
            if ((tet.attributes & TET.ATTR_DEHYPHENATION_ARTIFACT) != 0) {
                formatter.format("/dehyphenation_artifact");
            }
            if ((tet.attributes & TET.ATTR_DEHYPHENATION_POST) != 0) {
                formatter.format("/dehyphenation_post");
            }
            if ((tet.attributes & TET.ATTR_ARTIFACT) != 0) {
                formatter.format("/Artifact");
            }
        }
    }

    /**
     * Dump the administrative header, all text fragments and the details of
     * all glyphs of one opened page. The page is not closed here.
     *
     * @param formatter The formatter which receives the output
     * @param tet       The TET object
     * @param doc       The document handle
     * @param page      The handle of the opened page
     * @param pageno    The page number, used for the header and error reports
     * @throws TETException if one of the TET/pCOS calls fails
     */
    private static void dump_page(Formatter formatter, TET tet, int doc, int page, int pageno)
            throws TETException {
        String text;
        int previouscolor = -1;

        /*
         * Administrative information. The file name is passed as an argument
         * and not spliced into the format string, since a '%' in the name
         * would otherwise be interpreted as a format specifier.
         */
        formatter.format("\n[ Document: '%s' ]\n", tet.pcos_get_string(doc, "filename"));
        formatter.format("[ Document options: '%s' ]\n", docoptlist);
        formatter.format("[ Page options: '%s' ]\n", pageoptlist);
        formatter.format("[ ----- Page %d ----- ]\n", pageno);

        /* Retrieve all text fragments */
        while ((text = tet.get_text(page)) != null) {
            /* print the retrieved text */
            formatter.format("[%s]\n", text);

            /* Loop over all glyphs and print their details */
            while (tet.get_char_info(page) != -1) {
                /* Fetch the font name with pCOS (based on its ID) */
                final String fontname = tet.pcos_get_string(doc, "fonts[" + tet.fontid + "]/name");

                /* Print the Unicode value of the character */
                formatter.format("U+%04X", tet.uv);

                /* ...and the character itself */
                formatter.format(" '%c'", tet.uv);

                /* Print font name, size, and position */
                formatter.format(" %s size=%.2f x=%.2f y=%.2f", fontname, tet.fontsize, tet.x, tet.y);

                /* Print the color id */
                formatter.format(" colorid=%d", tet.colorid);

                /* Describe the color only when the text color changed */
                if (tet.colorid != previouscolor) {
                    print_color_value(formatter, tet, doc, tet.colorid);
                    previouscolor = tet.colorid;
                }

                print_glyph_flags(formatter, tet);

                formatter.format("\n");
            }

            formatter.format("\n");
        }

        /* get_text() returned null: report an error if there was one */
        if (tet.get_errnum() != 0) {
            print_tet_error(tet, pageno);
        }
    }

    /**
     * Program entry point: dumps the glyph information of all pages of the PDF
     * document named by the single command-line argument.
     *
     * @param argv Command-line arguments; exactly one file name is expected
     */
    public static void main(String argv[]) {
        TET tet = null;
        Formatter formatter = null;

        try {
            if (argv.length != 1) {
                throw new Exception("usage: glyphinfo <filename>");
            }

            /* print UTF-8 BOM */
            byte[] bom = new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF };
            System.out.write(bom);

            Writer outfp = new BufferedWriter(new OutputStreamWriter(System.out, "UTF-8"));

            formatter = new Formatter(outfp, Locale.US);

            tet = new TET();

            tet.set_option(globaloptlist);

            int doc = tet.open_document(argv[0], docoptlist);

            if (doc == -1) {
                throw new Exception(
                        "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
            }

            /* get number of pages in the document */
            int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

            /* loop over pages in the document */
            for (int pageno = 1; pageno <= n_pages; ++pageno) {
                int page = tet.open_page(doc, pageno, pageoptlist);

                if (page == -1) {
                    print_tet_error(tet, pageno);
                    continue; /* try next page */
                }

                dump_page(formatter, tet, doc, page, pageno);

                tet.close_page(page);
            }

            tet.close_document(doc);
        } catch (TETException e) {
            System.err.println("TET exception occurred in glyphinfo sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname() + ": " + e.get_errmsg());
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            if (tet != null) {
                tet.delete();
            }
            /*
             * Closing the formatter also closes the buffered writer wrapped by
             * it, so output produced before an error is not lost.
             */
            if (formatter != null) {
                formatter.close();
            }
        }
    }

    /**
     * Report a TET error.
     * 
     * @param tet    The TET object
     * @param pageno The page number on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno) {
        System.err.println("Error " + tet.get_errnum() + " in " + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
    }
}
(Oct 16, 2019 - )