高度なPDFアプリケーションの開発を支援する定番プログラムライブラリー Supported by インフォテック株式会社

PDFlib TET サンプル集(クックブック)

本サンプルプログラムは、PDF テキスト抽出ライブラリーの実装である TET の基本的な機能を実際のプログラムで紹介したものです。

本サイトでダウンロードした TET は、一部機能の制限を除き、評価版として無償でお使いいただけます。

PDFlib TET で、アルファベット順にソートされた索引を生成するサンプルプログラムです。

<?xml version="1.0" encoding="UTF-8"?>
<!--
    (C) PDFlib GmbH 2008 www.pdflib.com

    Purpose: Create an alphabetically sorted "back-of-the-book" index
    Required input: TETML in "word" or "wordplus" mode.
    Stylesheet parameters: none
    Version: $Id: index.xsl,v 1.2 2008/11/19 10:37:32 stm Exp $
-->

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    <xsl:output method="text" />
    <!-- Minimum word length for inclusion in index -->
    <xsl:param name="min-length">4</xsl:param>
    <!--
        Index required by the "Muenchian" method. We index the Word elements
        based on the content of the Text subelements.
    -->
    <xsl:key name="words" match="tet:TET/tet:Document/tet:Pages//tet:Word" use="tet:Text" />
    <!--
        Characters that may appear at the beginning of a word: the 26
        uppercase and the 26 lowercase ASCII letters. Both cases must be
        complete because this set is tested against the group key and also
        against the first character of the original word text; a letter
        missing here silently drops every word starting with it.
    -->
    <xsl:variable name="allowed-chars"
        select="'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'" />
    <!-- Index for the unique letters -->
    <xsl:key name="letter" match="tet:TET/tet:Document/tet:Pages//tet:Word"
                        'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
    <xsl:template match="/">
	<!-- Make sure that word information is present in the input TETML. -->
	<xsl:if test="tet:TET/tet:Document/tet:Pages//tet:Content[not(@granularity = 'word')]">
		<xsl:message terminate="yes">
			<xsl:text>Stylesheet index.xsl processing TETML for document '</xsl:text>
			<xsl:value-of select="tet:TET/tet:Document/@filename" />
			<xsl:text>': this stylesheet requires word info in TETML. </xsl:text>
			<xsl:text>Create the input in page mode "word" or "wordplus".</xsl:text>

        <xsl:text>Alphabetical list of words in the document along with their page number:&#xa;&#xa;</xsl:text>
            <!-- Group by first letter, sort by first letter. -->
            select="tet:TET/tet:Document/tet:Pages//tet:Word[generate-id() =
                select="translate(tet:Text, 'abcdefghijklmnopqrstuvwxyz', 
                               'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
    <xsl:template match="tet:Word" mode="index-letters">
        <!-- Get the group key -->
        <xsl:variable name="key"
            select="translate(substring(tet:Text, 1, 1),
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
        <!-- Suppress groups that are not in the allowed set of characters -->
        <xsl:if test="string-length(translate($key, $allowed-chars, '')) = 0">
            <!-- Select all words that start with the current group letter -->
            <xsl:variable name="letter-group"
                select="key('letter', $key)
                   [generate-id() = generate-id(key('words', tet:Text)[1])]" />
            <!--
                Filter out words that are not long enough or that start with
                a disallowed character.
            -->
            <xsl:variable name="allowed-words"
                select="$letter-group[string-length(tet:Text) &gt;= $min-length and
                               string-length(translate(substring(tet:Text, 1, 1), $allowed-chars, '')) = 0]"/>
            <!-- Suppress empty groups -->
            <xsl:if test="count($allowed-words) &gt; 0">
                <!-- Output label for current index group -->
                <xsl:value-of select="$key" />
                <xsl:apply-templates select="$allowed-words" mode="index-words">
                                            'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
    <xsl:template match="tet:Word" mode="index-words">
        <!-- Find all occurrences of index term -->
        <xsl:variable name="occurences" select="key('words', tet:Text)" />
        <!-- Output text of index term -->
        <xsl:value-of select="tet:Text" />
        <xsl:text> </xsl:text>
        <!-- Output page numbers where the term occurs -->
        <xsl:for-each select="$occurences/ancestor::tet:Page">
            <!-- Separate multiple page numbers by blanks -->
            <xsl:if test="position() != 1">
                <xsl:text> </xsl:text>
            <xsl:value-of select="@number" />
        <!-- Terminate word entry with new-line -->
(May 6, 2010 - Feb 21, 2014)