#!/usr/bin/perl

# index.pl - index tei files using kinosearch
#
# Walks DATADIR for *.xml files, extracts Dublin Core-like metadata and the
# full text from each TEI document, and writes one KinoSearch document per
# file into the inverted index at INDEX. The index is re-created on every run.

# Eric Lease Morgan
# 2007-07-06 - added source and publisher
# 2007-07-05 - got much of indexing done; works but needs improvement
# 2007-06-16 - first cut; based on work for catholic portal

# require
use strict;
use warnings;
use File::Find;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# define
use constant DATADIR => ( '/var/www/html/main/musings/oss-and-xml/etexts/tei' );
use constant INDEX   => '/var/www/html/main/musings/oss-and-xml/etexts/etc/index';

# every field stored in the index; parse() returns a value for each of these
my @FIELDS = qw( identifier title creator description language date subject source publisher );

# initialize the index
my $analyzer   = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
    invindex => INDEX,
    create   => 1,
    analyzer => $analyzer,
);
$invindexer->spec_field( name => $_ ) for @FIELDS;

# get a list of files to index
my @tei_files;
find( \&get_file_names, DATADIR );

# process each file
my $index         = 0;
my $total_records = scalar @tei_files;
foreach my $file ( @tei_files ) {

    $index++;

    # parse; a single malformed file must not abort the whole (re-created) index
    my $record;
    my $parsed = eval { $record = parse( $file ); 1 };
    if ( ! $parsed ) {
        warn "Skipping $file: $@";
        next;
    }

    # display/debug (description, i.e. the full text, is deliberately not echoed)
    print " record: $index of $total_records\n";
    foreach my $field ( qw( identifier title publisher source creator date language subject ) ) {
        print " $field: $record->{$field}\n";
    }
    print "\n";

    # index
    my $doc = $invindexer->new_doc;
    $doc->set_value( $_ => $record->{$_} ) for @FIELDS;
    $invindexer->add_doc( $doc );

}

# optimize
$invindexer->finish( optimize => 1 );

# done
exit;


#############
# subroutines

# File::Find "wanted" callback: remember the full path of every plain file
# whose name ends in .xml by pushing it onto the file-level @tei_files.
sub get_file_names {

    # get the full path
    my $file = $File::Find::name;

    # skip directories and non-xml files; use return, not next, because this
    # is a subroutine and next would jump into File::Find's own loop
    return unless -f $_;
    return if $file !~ /\.xml$/;

    push @tei_files, $file;
    return;

}

# Parse one TEI file and return a hash reference with one key per index field
# (identifier, title, creator, description, language, date, subject, source,
# publisher). Values are strings, empty when the element is absent.
# Dies if the file cannot be parsed as XML.
sub parse {

    # initialize
    my $file   = shift;
    my $parser = XML::LibXML->new;
    my $tei    = $parser->parse_file( $file );
    my $header = '/TEI.2/teiHeader';

    # source; every sourceDesc concatenated into one normalized string
    my $source = join ' ',
        map { $_->findvalue( '.' ) }
        $tei->findnodes( "$header/fileDesc/sourceDesc" );

    # keywords/subjects; one "; "-separated string
    my $subjects = join '; ',
        map { $_->findvalue( '.' ) }
        $tei->findnodes( "$header/profileDesc/textClass/keywords/list/item" );

    # return
    return {
        identifier  => $tei->findvalue( "$header/fileDesc/publicationStmt/idno" ),
        # creator and title use only the first matching node; not exactly
        # correct, but works
        creator     => _first_value( $tei, "$header/fileDesc/titleStmt/author/name" ),
        title       => _first_value( $tei, "$header/fileDesc/titleStmt/title" ),
        date        => $tei->findvalue( "$header/profileDesc/creation/date" ),
        publisher   => $tei->findvalue( "$header/fileDesc/publicationStmt/publisher" ),
        source      => _normalize_space( $source ),
        language    => $tei->findvalue( "$header/profileDesc/langUsage/language" ),
        subject     => $subjects,
        description => _normalize_space( $tei->findvalue( '/TEI.2/text/body' ) ),
    };

}

# Return the string value of the first node matching $xpath under $context,
# or the empty string when nothing matches.
sub _first_value {

    my ( $context, $xpath ) = @_;
    my ( $node ) = $context->findnodes( $xpath );
    return defined $node ? $node->findvalue( '.' ) : '';

}

# Poor-man's normalize-space: collapse every run of whitespace (including
# newlines and tabs) to a single space and trim both ends.
sub _normalize_space {

    my $text = shift;
    $text = '' unless defined $text;
    $text =~ s/\s+/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;
    return $text;

}