#!/usr/bin/perl

# mods2kinosearch.pl - indexes a MODS file with KinoSearch
#
# Usage: mods2kinosearch.pl filename
#
# Reads the given XML file, finds every mods:mods element in it, maps a
# selection of MODS elements to Dublin Core-style fields, prints each
# record to STDOUT, and adds it as a document to the KinoSearch index
# at INDEX. The index is created from scratch on every run (create => 1).

# Eric Lease Morgan
# 2007-07-14 - added namespace declaration; seems inelegant but functional
# 2007-06-24 - first cut; based on crri

# require
use strict;
use warnings;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# define
use constant INDEX   => '../etc/index';
use constant MODS_NS => 'http://www.loc.gov/mods/v3';

# Every field specified in the index. 'collection' is a fixed value per
# document; all the others are filled from @FIELD_MAP below.
my @INDEX_FIELDS = qw(
    collection coverage    creator  date      description
    format     identifier  language publisher relation
    rights     subject     title    type
);

# Map MODS to DC; see http://www.loc.gov/standards/mods/mods-dcsimple.html
# Each entry names the index field, the label used when displaying the
# record, and the XPath expressions (evaluated relative to one mods:mods
# element, in the order listed) whose text content makes up the value.
# The entries are listed in display order.
my @FIELD_MAP = (
    {   field  => 'title',
        label  => 'title',
        xpaths => [ './/mods:titleInfo/mods:title' ],
    },
    {   field  => 'creator',
        label  => 'creators',
        xpaths => [ './/mods:name/mods:namePart' ],
    },
    {   field  => 'subject',
        label  => 'subjects',
        xpaths => [
            './/mods:subject/mods:topic',
            './/mods:subject/mods:name',
            './/mods:subject/mods:occupation',
            './/mods:classification',
        ],
    },
    {   field  => 'description',
        label  => 'descriptions',
        xpaths => [
            './/mods:abstract',
            './/mods:note',
            './/mods:tableOfContents',
        ],
    },
    {   field  => 'publisher',
        label  => 'publishers',
        xpaths => [ './/mods:originInfo/mods:publisher' ],
    },
    {   field  => 'date',
        label  => 'dates',
        xpaths => [
            './/mods:originInfo/mods:dateIssued',
            './/mods:originInfo/mods:dateCreated',
            './/mods:originInfo/mods:dateCaptured',
            './/mods:originInfo/mods:dateOther',
        ],
    },
    {   field  => 'type',
        label  => 'types',
        xpaths => [
            './/mods:typeOfResource',
            './/mods:genre',
        ],
    },
    {   field  => 'format',
        label  => 'formats',
        xpaths => [
            './/mods:physicalDescription',
            './/mods:internetMediaType',
            './/mods:extent',
            './/mods:form',
        ],
    },
    {   field  => 'identifier',
        label  => 'identifiers',
        xpaths => [
            './/mods:identifier',

            # The element is <url> in the MODS namespace; an unprefixed,
            # upper-case "URL" step never matches anything.
            './/mods:location/mods:url',
        ],
    },
    {   field  => 'language',
        label  => 'languages',
        xpaths => [ './/mods:language' ],
    },
    {   field  => 'relation',
        label  => 'relations',
        xpaths => [ './/mods:relatedItem' ],
    },
    {   field  => 'coverage',
        label  => 'coverages',
        xpaths => [
            './/mods:subject/mods:geographic',
            './/mods:subject/mods:temporal',
            './/mods:subject/mods:hierarchicalGeographic',
            './/mods:subject/mods:cartographics',
        ],
    },
    {   field  => 'rights',
        label  => 'rights',

        # Needs the mods: prefix like every other step; without it the
        # expression looks for an element in no namespace and finds none.
        xpaths => [ './/mods:accessCondition' ],
    },
);

# get the input
my $input = shift;
if ( !$input ) {
    print "Usage: $0 filename\n";
    exit;
}

# initialize the index
my $analyzer   = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
    invindex => INDEX,
    create   => 1,
    analyzer => $analyzer,
);
$invindexer->spec_field( name => $_ ) for @INDEX_FIELDS;

# initialize/configure
my $index = 0;
binmode( STDOUT, ':utf8' );

# parse and register the mods namespace
my $parser = XML::LibXML->new;
my $xpc    = XML::LibXML::XPathContext->new( $parser->parse_file($input) );
$xpc->registerNs( 'mods', MODS_NS );

# loop through each mods element
foreach my $mods ( $xpc->findnodes('//mods:mods') ) {

    # increment
    $index++;

    # Extract every mapped field. The one XPath context (and its namespace
    # registration) is reused; the current record is passed as the context
    # node so the relative './/' expressions are scoped to it.
    my %value_of;
    foreach my $entry (@FIELD_MAP) {
        $value_of{ $entry->{field} }
            = collect_values( $xpc, $mods, @{ $entry->{xpaths} } );
    }

    # display
    print " record: $index\n";
    foreach my $entry (@FIELD_MAP) {
        print " $entry->{label}: $value_of{ $entry->{field} }\n";
    }
    print "\n";

    # update the index
    # NOTE(review): the record number is handed to new_doc exactly as the
    # original script did; confirm whether new_doc makes any use of it.
    my $doc = $invindexer->new_doc($index);
    foreach my $field ( sort keys %value_of ) {
        $doc->set_value( $field => $value_of{$field} );
    }
    $doc->set_value( collection => 'mods' );
    $invindexer->add_doc($doc);
}

# optimize
print "Optimizing... ";
$invindexer->finish( optimize => 1 );
print "done.\n";

# done
exit;


# collect_values( $xpc, $context, @xpaths )
#
# Evaluates each XPath expression in @xpaths, in order, against the node
# $context using the XPath context object $xpc. Returns a single string
# made of the normalized text content of every matching node, each one
# followed by a '|' delimiter (so a non-empty result always ends in '|').
# Returns the empty string when nothing matches.
sub collect_values {
    my ( $xpc, $context, @xpaths ) = @_;

    my $values = '';
    foreach my $xpath (@xpaths) {
        foreach my $node ( $xpc->findnodes( $xpath, $context ) ) {
            $values .= normalize( $node->textContent ) . '|';
        }
    }

    return $values;
}


# normalize( $s )
#
# Returns a copy of $s with leading and trailing whitespace removed, tabs,
# newlines and carriage returns turned into spaces, and every run of
# spaces collapsed to a single space.
sub normalize {
    my $s = shift;

    $s =~ s/^\s+//;
    $s =~ s/\s+$//;
    $s =~ tr/\t\n\r/ /;

    # One global substitution instead of a match-and-replace loop, so the
    # collapse is guaranteed to terminate.
    $s =~ s/ {2,}/ /g;

    return $s;
}