#!/usr/bin/perl

# tei2kinosearch.pl - index tei files using kinosearch

# Eric Lease Morgan
# 2007-04-26 - first cut; based on ead2kinosearch.pl

use constant DATADIR => ( '../etc/tei' );
use constant INDEX   => '../etc/index';

# require
use strict;
use warnings;
use File::Find;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# initialize the index
# NOTE(review): create => 0 presumably opens an already existing invindex
# instead of building a new one -- confirm against the KinoSearch docs
my $analyzer   = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
    invindex => INDEX,
    create   => 0,
    analyzer => $analyzer,
);

# declare the fields every document carries
$invindexer->spec_field( name => 'identifier' );
$invindexer->spec_field( name => 'title' );
$invindexer->spec_field( name => 'creator' );
$invindexer->spec_field( name => 'subject' );
$invindexer->spec_field( name => 'description' );
$invindexer->spec_field( name => 'publisher' );
$invindexer->spec_field( name => 'collection' );

# get a list of files to index; get_file_names fills @tei_files
my @tei_files;
find( \&get_file_names, DATADIR );

# process each file
my $index = 0;
foreach my $file ( @tei_files ) {

    # parse; the -1 limit keeps trailing empty fields, so an empty
    # publisher comes back as '' rather than undef
    my ( $identifier, $title, $subjects, $creators, $description, $publisher )
        = split /\t/, parse( $file ), -1;

    # display/debug
    print " identifier: $identifier\n";
    print " title: $title\n";
    my @subject_terms = split /\|/, $subjects;
    foreach my $subject_term ( @subject_terms ) {
        print " subject: $subject_term\n";
    }
    my @creator_names = split /\|/, $creators;
    foreach my $creator_name ( @creator_names ) {
        print " creator: $creator_name\n";
    }
    print " description: $description\n";
    print " publisher: $publisher\n";
    print "\n";

    # index
    $index++;
    my $doc = $invindexer->new_doc( $index );
    $doc->set_value( identifier  => $identifier );
    $doc->set_value( title       => $title );
    $doc->set_value( subject     => $subjects );
    $doc->set_value( creator     => $creators );
    $doc->set_value( description => $description );
    $doc->set_value( publisher   => $publisher );
    $doc->set_value( collection  => 'tei' );
    $invindexer->add_doc( $doc );

}

# optimize
$invindexer->finish( optimize => 1 );

# done
exit;


# File::Find callback: remember every visited path that ends in ".xml".
# Takes no arguments; reads $File::Find::name and appends to @tei_files.
sub get_file_names {

    # get the full path
    my $file = $File::Find::name;

    # skip non-xml files; this is a plain sub, not a loop body, so leave
    # it with return (next would exit through the caller's loop and warn)
    return if ( $file !~ /\.xml$/ );

    # add the file to the list
    push @tei_files, $file;
    return;

}


# Extract the metadata of one TEI file.
#   input:  the path of the file to parse
#   output: a single tab-delimited record --
#           identifier, title, subjects, creators, description, publisher;
#           every subject term is followed by a '|'
# Dies if XML::LibXML cannot parse the file.
sub parse {

    # initialize
    my $file   = shift;
    my $parser = XML::LibXML->new;
    my $tei    = $parser->parse_file( $file );

    # extract the identifier/url; normalized so that an embedded tab or
    # newline can not shift the fields of the returned record
    my $identifier = normalize(
        $tei->findvalue( '/TEI.2/teiHeader/fileDesc/publicationStmt/idno' ) );

    # extract creators; only the first author name is used, and a header
    # without one yields an empty string instead of a fatal method call
    my ( $author ) =
        $tei->findnodes( '/TEI.2/teiHeader/fileDesc/titleStmt/author/name' );
    my $creators = defined $author ? normalize( $author->textContent ) : '';

    # extract the title; same guard as for the creators
    my ( $heading ) =
        $tei->findnodes( '/TEI.2/teiHeader/fileDesc/titleStmt/title' );
    my $title = defined $heading ? normalize( $heading->textContent ) : '';

    # extract the subjects
    my $subjects = '';
    foreach my $node ( $tei->findnodes( '//keywords/list/item' ) ) {
        $subjects .= normalize( $node->textContent ) . '|';
    }

    # extract (create) the description from the start of the body text
    my $description = normalize(
        substr( $tei->findvalue( '/TEI.2/text/body' ), 0, 253 ) . '...' );

    # extract the publisher
    my $publisher = normalize(
        $tei->findvalue( '/TEI.2/teiHeader/fileDesc/publicationStmt/publisher' ) );

    # return a record
    return "$identifier\t$title\t$subjects\t$creators\t$description\t$publisher";

}


# Tidy the whitespace of a string.
#   input:  a string (undef is treated as the empty string)
#   output: the string without leading/trailing whitespace, with tabs,
#           newlines and carriage returns turned into spaces, and with
#           runs of spaces reduced to a single space
sub normalize {

    my $s = shift;
    return '' unless defined $s;

    # trim both ends
    $s =~ s/^\s+//;
    $s =~ s/\s+$//;

    # turn tabs, newlines, and carriage returns into spaces
    $s =~ tr/\t\n\r/ /;

    # squeeze runs of spaces in a single pass
    $s =~ tr/ //s;

    return $s;

}