#!/usr/bin/perl

# ead2kinosearch.pl - index ead files using kinosearch

# Eric Lease Morgan
# 2007-04-19 - actually got it to go
# 2007-04-17 - first cut

# configuration: directory searched for EAD files, and location of the index
use constant DATADIR => ( '../etc/ead' );
use constant INDEX   => '../etc/index';

# require
use strict;
use warnings;
use File::Find;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# initialize the index
# NOTE(review): create => 0 presumably means the index at INDEX must already
# exist and is appended to -- confirm against the KinoSearch version in use
my $analyzer   = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
    invindex => INDEX,
    create   => 0,
    analyzer => $analyzer,
);

# declare every field a document may carry
foreach my $field_name (
    qw( identifier title creator subject description publisher collection ) )
{
    $invindexer->spec_field( name => $field_name );
}

# get a list of files to index; get_file_names() pushes onto @ead_files
my @ead_files;
find( \&get_file_names, DATADIR );

# process each file, creating one collection-level document per finding aid
my $index = 0;
foreach my $file ( @ead_files ) {

    # parse; parse() returns one tab-delimited record, and the -1 limit keeps
    # trailing empty fields so an empty description or publisher comes back
    # as '' rather than undef
    my ( $identifier, $title, $subjects, $creators, $description, $publisher )
        = split /\t/, parse( $file ), -1;

    # display/debug; subjects and creators are '|'-delimited lists
    print " identifier: $identifier\n";
    print " title: $title\n";
    foreach my $subject_term ( split /\|/, $subjects ) {
        print " subject: $subject_term\n";
    }
    foreach my $creator_name ( split /\|/, $creators ) {
        print " creator: $creator_name\n";
    }
    print " description: $description\n";
    print " publisher: $publisher\n";
    print "\n";

    # index
    $index++;
    my $doc = $invindexer->new_doc( $index );
    $doc->set_value( identifier  => $identifier );
    $doc->set_value( title       => $title );
    $doc->set_value( subject     => $subjects );
    $doc->set_value( creator     => $creators );
    $doc->set_value( description => $description );
    $doc->set_value( publisher   => $publisher );
    $doc->set_value( collection  => 'ead' );
    $invindexer->add_doc( $doc );

}

# do it again, but for each folder this time
foreach my $file ( @ead_files ) {

    # parse the finding aid and extract the collection-level metadata that
    # every folder document in this file shares
    my $ead    = XML::LibXML->new->parse_file( $file );
    my $record = _extract_record( $ead );

    # the '|'-delimited lists do not change per folder, so split them once
    my @subject_terms = split /\|/, $record->{subjects};
    my @creator_names = split /\|/, $record->{creators};

    # loop through each folder: unnumbered (c) and numbered (c01..c12)
    # component elements anywhere in the document
    my @folders = $ead->findnodes(
        '//c|//c01|//c02|//c03|//c04|//c05|//c06|//c07|//c08|//c09|//c10|//c11|//c12'
    );
    foreach my $node ( @folders ) {

        # get this folder
        my $folder = normalize( $node->findvalue( './did/unittitle' ) );

        # display/debug; the title is the combined title/subtitle, the same
        # value parse() puts in its record
        print " identifier: $record->{identifier}\n";
        print " title: $record->{title}\n";
        foreach my $subject_term ( @subject_terms ) {
            print " subject: $subject_term\n";
        }
        foreach my $creator_name ( @creator_names ) {
            print " creator: $creator_name\n";
        }
        print " folder: $folder\n";
        print " publisher: $record->{publisher}\n";
        print "\n";

        # index; the folder's unit title is stored in the description field
        # NOTE(review): no 'collection' value is set for folder documents,
        # unlike the collection-level documents -- confirm this is intended
        $index++;
        my $doc = $invindexer->new_doc( $index );
        $doc->set_value( identifier  => $record->{identifier} );
        $doc->set_value( title       => $record->{title} );
        $doc->set_value( subject     => $record->{subjects} );
        $doc->set_value( creator     => $record->{creators} );
        $doc->set_value( description => $folder );
        $doc->set_value( publisher   => $record->{publisher} );
        $invindexer->add_doc( $doc );

    }

}

# finish the indexing session, asking for an optimized index
$invindexer->finish( optimize => 1 );

# done
exit;


# File::Find callback: remember the full path of every file ending in .xml
sub get_file_names {

    # get the full path
    my $file = $File::Find::name;

    # skip non-xml files; return (not next) is the way to leave a callback
    return if ( $file !~ /\.xml$/ );

    push @ead_files, $file;
    return;

}


# placeholder; intentionally empty
sub parse_folders { }


# Parse one EAD file and return a single tab-delimited record:
#   identifier, title (with subtitle), subjects, creators, description,
#   publisher
# subjects and creators are '|'-terminated lists of normalized terms.
sub parse {

    # initialize
    my $file   = shift;
    my $ead    = XML::LibXML->new->parse_file( $file );
    my $record = _extract_record( $ead );

    # extract the description
    my $description
        = normalize( $ead->findvalue( '/ead/archdesc/scopecontent' ) );

    # return a record
    return join "\t",
        $record->{identifier},
        $record->{title},
        $record->{subjects},
        $record->{creators},
        $description,
        $record->{publisher};

}


# Extract the collection-level metadata from a parsed EAD document.
# Takes the document object returned by parse_file() and returns a hash
# reference with the keys identifier, title, subjects, creators and
# publisher; every value is a defined (possibly empty) string.
sub _extract_record {

    my $ead = shift;

    # the identifier/url
    my $identifier = $ead->findvalue( '/ead/eadheader/eadid/@url' );

    # creators
    my $creators = _collect_terms(
        $ead,
        '//origination/corpname',
        '//origination/persname',
        '//origination/famname',
    );

    # the title, with the subtitle appended when there is one
    my $title = normalize(
        $ead->findvalue( '/ead/eadheader/filedesc/titlestmt/titleproper' ) );
    my $subtitle = normalize(
        $ead->findvalue( '/ead/eadheader/filedesc/titlestmt/subtitle' ) );
    my $titleproper = $subtitle ? "$title / $subtitle" : $title;

    # subjects
    my $subjects = _collect_terms(
        $ead,
        '//controlaccess/subject',
        '//controlaccess/persname',
        '//controlaccess/corpname',
        '//controlaccess/genreform',
    );

    # the publisher
    my $publisher = normalize(
        $ead->findvalue( '/ead/eadheader/filedesc/publicationstmt/publisher' ) );

    return {
        identifier => $identifier,
        title      => $titleproper,
        subjects   => $subjects,
        creators   => $creators,
        publisher  => $publisher,
    };

}


# Return the normalized text of every node matched by the given XPath
# expressions (in the order given), each followed by '|'; returns '' when
# nothing matches.
sub _collect_terms {

    my ( $ead, @xpaths ) = @_;
    my @nodes = map { $ead->findnodes( $_ ) } @xpaths;
    return join '', map { normalize( $_->textContent ) . '|' } @nodes;

}


# Tidy a string for indexing: strip leading and trailing whitespace, turn
# tabs, newlines and carriage returns into spaces, and collapse runs of
# spaces into one. An undefined argument is treated as the empty string.
sub normalize {

    my $s = shift;
    $s = '' unless defined $s;

    $s =~ s/^\s+//;
    $s =~ s/\s+\z//;
    $s =~ tr/\t\n\r/ /;
    $s =~ s/ {2,}/ /g;

    return $s;

}