PDFlib TET サンプル集（クックブック)

本サンプルプログラムは、PDF テキスト抽出ライブラリーの実装である TET の基本的な機能を実際のプログラムで紹介したものです。
本サイトでダウンロードした TET は、一部機能の制限を除き、評価版として無償でお使いいただけます。
関連リンク
PDFlib とは / PDFlib 製品情報 / 技術情報 / ダウンロード / 見積申込み / 購入申込み
サンプル集一覧

<?xml version="1.0" encoding="UTF-8"?>
<!--
    (C) PDFlib GmbH 2008 www.pdflib.com

    Purpose: Create an alphabetically sorted "back-of-the-book" index
    
    Required input: TETML in "word" or "wordplus" mode.
    
    Stylesheet parameters: none
    
    Version: $Id: index.xsl,v 1.2 2008/11/19 10:37:32 stm Exp $
-->

<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:tet="http://www.pdflib.com/XML/TET3/TET-3.0"
>
    <xsl:output method="text" />
    
    <!-- Minimum word length for inclusion in index -->
    <xsl:param name="min-length">4</xsl:param>
    
    <!--
        Index required by the "Muenchian" method. We index the Word elements
        based on the content of the Text subelements.
    -->
    <xsl:key name="words" match="tet:TET/tet:Document/tet:Pages//tet:Word" use="tet:Text" />
    
    <!-- Characters that may appear at the beginning of a word -->
    <xsl:variable name="allowed-chars"
        select="'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvw'" />
        
    <!--
        Index for the unique letters 
    -->
    <xsl:key name="letter" match="tet:TET/tet:Document/tet:Pages//tet:Word"
        use="translate(substring(tet:Text,1,1), 
                        'abcdefghijklmnopqrstuvwxyz', 
                        'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
                        
    <xsl:template match="/">
	<!-- Make sure that word information is present in the input TETML. -->
	<xsl:if test="tet:TET/tet:Document/tet:Pages//tet:Content[not(@granularity = 'word')]">
		<xsl:message terminate="yes">
			<xsl:text>Stylesheet index.xsl processing TETML for document '</xsl:text>
			<xsl:value-of select="tet:TET/tet:Document/@filename" />
			<xsl:text>': this stylesheet requires word info in TETML. </xsl:text>
			<xsl:text>Create the input in page mode "word" or "wordplus".</xsl:text>
		</xsl:message>
	</xsl:if>

        <xsl:text>Alphabetical list of words in the document along with their page number:&#xa;&#xa;</xsl:text>
        
        <!--
            Group by first letter, sort by first letter. 
        -->
        <xsl:apply-templates
            select="tet:TET/tet:Document/tet:Pages//tet:Word[generate-id() =
                                generate-id(key('letter',
                                 translate(substring(tet:Text,1,1), 
                                   'abcdefghijklmnopqrstuvwxyz', 
                                   'ABCDEFGHIJKLMNOPQRSTUVWXYZ'))[1])]"
            mode="index-letters">
            <xsl:sort
                select="translate(tet:Text, 'abcdefghijklmnopqrstuvwxyz', 
                               'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
        </xsl:apply-templates>
    </xsl:template>
    
    
    <xsl:template match="tet:Word" mode="index-letters">
        <!-- Get the group key -->
        <xsl:variable name="key"
            select="translate(substring(tet:Text, 1, 1),
                                      'abcdefghijklmnopqrstuvwxyz', 
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
    
        <!-- Suppress groups that are not in the allowed set of characters -->
        <xsl:if test="string-length(translate($key, $allowed-chars, '')) = 0">
        
            <!-- Select all words that start with the current group letter -->
            <xsl:variable name="letter-group"
                select="key('letter', $key)
                   [generate-id() = generate-id(key('words', tet:Text)[1])]" />
    
            <!--
                Filter out words that are not long enough or that start with
                a disallowed character.
            -->
            <xsl:variable name="allowed-words"
                select="$letter-group[string-length(tet:Text) &gt;= $min-length and
                               string-length(translate(substring(tet:Text, 1, 1), $allowed-chars, '')) = 0]"/>
            
            <!-- Suppress empty groups -->
            <xsl:if test="count($allowed-words) &gt; 0">
                <!-- Output label for current index group -->
                <xsl:value-of select="$key" />
                <xsl:text>&#xa;</xsl:text>
            
                <xsl:apply-templates select="$allowed-words" mode="index-words">
                    <xsl:sort
                        select="translate(tet:Text, 
                                            'abcdefghijklmnopqrstuvwxyz',
                                            'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" />
                </xsl:apply-templates>
                <xsl:text>&#xa;</xsl:text>
            </xsl:if>
        </xsl:if>
    </xsl:template>
    
    <xsl:template match="tet:Word" mode="index-words">
        <!-- Find all occurences of index term -->
        <xsl:variable name="occurences" select="key('words', tet:Text)" />
    
        <!-- Output text of index term -->
        <xsl:value-of select="tet:Text" />
        <xsl:text> </xsl:text>
        
        <!-- Output page numbers where the term occurs -->
        <xsl:for-each select="$occurences/ancestor::tet:Page">
            <!-- Separate multiple page numbers by blanks -->
            <xsl:if test="position() != 1">
                <xsl:text> </xsl:text>
            </xsl:if>
            <xsl:value-of select="@number" />
        </xsl:for-each>
        
        <!-- Terminate word entry with new-line -->
        <xsl:text>&#xa;</xsl:text>
    </xsl:template>
</xsl:stylesheet>
(May 6, 2010 - Feb 21, 2014)
PDFlib TET サンプル集（クックブック)

目次