#!/usr/bin/perl # vra2kinosearch.pl - index vra files using kinosearch # Eric Lease Morgan # 2007-04-27 - first cut; based on ead2kinosearch.pl use constant DATADIR => ( '../etc/vra' ); use constant INDEX => '../etc/index'; # require use strict; use File::Find; use XML::LibXML; use KinoSearch::InvIndexer; use KinoSearch::Analysis::PolyAnalyzer; # initialize the index my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' ); my $invindexer = KinoSearch::InvIndexer->new( invindex => INDEX, create => 0, analyzer => $analyzer ); $invindexer->spec_field( name => 'identifier' ); $invindexer->spec_field( name => 'title' ); $invindexer->spec_field( name => 'creator' ); $invindexer->spec_field( name => 'subject' ); $invindexer->spec_field( name => 'description' ); $invindexer->spec_field( name => 'format' ); $invindexer->spec_field( name => 'source' ); $invindexer->spec_field( name => 'rights' ); $invindexer->spec_field( name => 'date' ); $invindexer->spec_field( name => 'collection' ); # get a list of files to index my @vra_files; find( \&get_file_names, DATADIR ); # process each file my $index = 0; foreach my $file ( @vra_files ) { # parse my ( $identifier, $title, $subjects, $creators, $description, $formats, $source, $rights, $date ) = split /\t/, &parse( $file ); # display/debug print " identifier: $identifier\n"; print " title: $title\n"; my @subject_terms = split /\|/, $subjects; foreach my $subject_term ( @subject_terms ) { print " subject: $subject_term\n" } my @creator_names = split /\|/, $creators; foreach my $creator_name ( @creator_names ) { print " creator: $creator_name\n" } print " description: $description\n"; my @format_names = split /\|/, $formats; foreach my $format_name ( @format_names ) { print " format: $format_name\n" } print " source: $source\n"; print " rights: $rights\n"; print " date: $date\n"; print "\n"; # index $index++; my $doc = $invindexer->new_doc( $index ); $doc->set_value( identifier => $identifier ); $doc->set_value( title => $title ); $doc->set_value( subject => $subjects ); $doc->set_value( creator => $creators ); $doc->set_value( description => $description ); $doc->set_value( format => $formats ); $doc->set_value( source => $source ); $doc->set_value( rights => $rights ); $doc->set_value( date => $date ); $doc->set_value( collection => 'vra' ); $invindexer->add_doc( $doc ); } # optimize $invindexer->finish( optimize => 1 ); # done exit; sub get_file_names { # get the full path my $file = $File::Find::name; # remove non-xml files and add them to the list next if ( $file !~ /\.xml$/ ); push @vra_files, $file; } sub parse { # initialize my $file = shift; my $parser = XML::LibXML->new; my ( @nodes, $node ); my $vra = $parser->parse_file( $file ); $vra = XML::LibXML::XPathContext->new( $vra ); $vra->registerNs( 'v', 'http://www.vraweb.org/vracore4.htm' ); # extract the identifier/url my $identifier = $vra->findvalue( '/v:vra/v:work/@refid' ); # extract creators @nodes = (); my $creators = ''; $creators = $vra->findvalue( '/v:vra/v:work/v:agentSet/v:display' ); # extract the title my $title = ''; $title = &normalize( $vra->findvalue( '/v:vra/v:work/v:titleSet/v:display' )); # extract the subjects @nodes = (); $node = ''; my $subjects = ''; push @nodes, $vra->findnodes( '//v:subject/v:term' ); foreach $node ( @nodes ) { $subjects .= &normalize( $node->textContent ) . '|' } # extract (create) the description @nodes = (); $node = ''; my $description = ''; push @nodes, $vra->findnodes( '//v:image/v:titleSet/v:display' ); $description = &normalize( $nodes[ 0 ]->textContent ); # extract the format @nodes = (); $node = ''; my $formats = ''; push @nodes, $vra->findnodes( '//v:image/v:worktypeSet/v:display' ); $formats = &normalize( $nodes[ 0 ]->textContent ); # extract the source my $source = $vra->findvalue( '/v:vra/v:work/@source' ); # extract the source my $rights = $vra->findvalue( '/v:vra/v:image/v:rightsSet/v:rights' ); # extract the date my $date = $vra->findvalue( '/v:vra/v:work/v:dateSet/v:display' ); # return a record return "$identifier\t$title\t$subjects\t$creators\t$description\t$formats\t$source\t$rights\t$date"; } sub normalize { my $s = shift; while ( $s =~ /^\s/ ) { $s =~ s/^\s// } while ( $s =~ /\s$/ ) { $s =~ s/\s$// } $s =~ s/\t/ /g; $s =~ s/\n/ /g; $s =~ s/\r/ /g; while ( $s =~ / / ) { $s =~ s/ / / } return $s; }