/*
 * Text extraction using TET.
 *
 * Required software: TET 5
 *
 * Required data: PDF document
 */
package com.pdflib.cookbook.tet.text;

import java.io.PrintStream;
import java.io.UnsupportedEncodingException;

import com.pdflib.TETException;
import com.pdflib.TET;

/**
 * Command-line sample that opens the PDF document named by its single
 * argument, walks all pages and prints the text returned by TET to
 * {@code System.out}, followed by {@link #SEPARATOR} after each chunk.
 */
public class text_extractor {
    /*
     * Global option list.
     */
    static final String GLOBAL_OPTLIST = "searchpath={../resource/cmap "
            + "../resource/glyphlist ../input}";

    /*
     * Document-specific option list.
     */
    static final String DOC_OPTLIST = "";

    /*
     * Page-specific option list.
     */
    static final String PAGE_OPTLIST = "granularity=page";

    /*
     * Separator printed after each chunk of text. What is appropriate
     * depends on the application; for granularity=word a space character
     * may be useful.
     */
    static final String SEPARATOR = "\n";

    /*
     * Encoding used for the text sent to System.out; taken from the
     * "file.encoding" system property.
     */
    private static final String OUTPUT_ENCODING =
            System.getProperty("file.encoding");

    /*
     * Stream wrapping System.out that writes with OUTPUT_ENCODING.
     * Assigned in main() before any extraction starts.
     */
    private static PrintStream out;

    /**
     * Program entry point.
     *
     * @param argv command-line arguments; exactly one is expected, the
     *             name of the PDF document to open
     * @throws UnsupportedEncodingException if OUTPUT_ENCODING is not
     *             supported by the PrintStream constructor
     */
    public static void main(String argv[]) throws UnsupportedEncodingException {
        System.out.println("Using output encoding \"" + OUTPUT_ENCODING + "\"");
        out = new PrintStream(System.out, true, OUTPUT_ENCODING);

        /*
         * The exit status is only recorded inside the catch blocks and
         * acted upon after the try statement. Calling System.exit()
         * directly inside a catch block would terminate the JVM before
         * the finally block had a chance to run, so tet.delete() would be
         * skipped on every error path.
         */
        int exit_status = 0;
        TET tet = null;

        try {
            if (argv.length != 1) {
                throw new Exception("usage: text_extractor <filename>");
            }

            tet = new TET();
            tet.set_option(GLOBAL_OPTLIST);

            extract_document(tet, argv[0]);
        } catch (TETException e) {
            System.err.println("TET exception occurred in extractor sample:");
            System.err.println("[" + e.get_errnum() + "] " + e.get_apiname()
                    + ": " + e.get_errmsg());
            exit_status = 1;
        } catch (Exception e) {
            System.err.println(e);
            exit_status = 1;
        } finally {
            if (tet != null) {
                tet.delete();
            }
        }

        if (exit_status != 0) {
            System.exit(exit_status);
        }
    }

    /*
     * Opens the document, extracts the text of every page and closes the
     * document again.
     *
     * @param tet      TET object
     * @param filename name of the document passed to open_document()
     * @throws Exception if open_document() returns -1; the message carries
     *                   the TET error number, API name and error message.
     *                   Exceptions raised by the TET calls themselves
     *                   propagate unchanged.
     */
    private static void extract_document(TET tet, String filename)
            throws Exception {
        int doc = tet.open_document(filename, DOC_OPTLIST);
        if (doc == -1) {
            throw new Exception("Error " + tet.get_errnum() + " in "
                    + tet.get_apiname() + "(): " + tet.get_errmsg());
        }

        /* Get the number of pages in the document. */
        int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

        /* Loop over all pages of the document (page numbers start at 1). */
        for (int pageno = 1; pageno <= n_pages; ++pageno) {
            extract_page(tet, doc, pageno);
        }

        tet.close_document(doc);
    }

    /*
     * Extracts and prints the text of a single page. If the page cannot be
     * opened the error is reported on System.err and the page is skipped,
     * so that processing continues with the next page.
     *
     * @param tet    TET object
     * @param doc    document handle returned by open_document()
     * @param pageno number of the page to process
     * @throws TETException if one of the TET calls raises it
     */
    private static void extract_page(TET tet, int doc, int pageno)
            throws TETException {
        int page = tet.open_page(doc, pageno, PAGE_OPTLIST);
        if (page < 0) {
            print_tet_error(tet, pageno);
            return; /* skip this page */
        }

        /*
         * Retrieve the text. The loop is not strictly needed when
         * "granularity=page" is specified, but it is required for the
         * other granularities.
         */
        String text;
        while ((text = tet.get_text(page)) != null) {
            /* Loop over all characters of the current chunk. */
            while (tet.get_char_info(page) != -1) {
                /*
                 * This shows how to retrieve the font name. If position
                 * information is needed it is available via the x and y
                 * coordinates. The result is deliberately discarded: the
                 * "fontname" variable is commented out to avoid an
                 * unused-variable warning.
                 */
                /* String fontname = */
                tet.pcos_get_string(doc, "fonts[" + tet.fontid + "]/name");
            }

            /* Print the retrieved text. */
            out.print(text);

            /* Print a separator between chunks of extracted text. */
            out.print(SEPARATOR);
        }

        /* get_text() returned null: report an error if one is pending. */
        if (tet.get_errnum() != 0) {
            print_tet_error(tet, pageno);
        }

        tet.close_page(page);
    }

    /*
     * Reports a TET error on System.err.
     *
     * @param tet    TET object
     * @param pageno number of the page on which the error occurred
     */
    private static void print_tet_error(TET tet, int pageno) {
        System.err.println("Error " + tet.get_errnum() + " in "
                + tet.get_apiname() + "() on page " + pageno + ": "
                + tet.get_errmsg());
    }
}