#!/usr/bin/perl

# vra2kinosearch.pl - index vra files using kinosearch

# Eric Lease Morgan <emorgan@nd.edu>
# 2007-04-27 - first cut; based on ead2kinosearch.pl

use constant DATADIR => ( '../etc/vra' );
use constant INDEX   => '../etc/index';

# require
use strict;
use File::Find;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# initialize the index
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
	invindex => INDEX,
	create   => 0,
	analyzer => $analyzer
);
$invindexer->spec_field( name => 'identifier' );
$invindexer->spec_field( name => 'title' );
$invindexer->spec_field( name => 'creator' );
$invindexer->spec_field( name => 'subject' );
$invindexer->spec_field( name => 'description' );
$invindexer->spec_field( name => 'format' );
$invindexer->spec_field( name => 'source' );
$invindexer->spec_field( name => 'rights' );
$invindexer->spec_field( name => 'date' );
$invindexer->spec_field( name => 'collection' );

# get a list of files to index
my @vra_files;
find( \&get_file_names, DATADIR );

# process each file
my $index = 0;
foreach my $file ( @vra_files ) {

	# parse
	my ( $identifier, $title, $subjects, $creators, $description, $formats, $source, $rights, $date ) = split /\t/, &parse( $file );

	# display/debug
	print "   identifier: $identifier\n";
	print "        title: $title\n";
	my @subject_terms = split /\|/, $subjects;
	foreach my $subject_term ( @subject_terms ) { print "      subject: $subject_term\n" }
	my @creator_names = split /\|/, $creators;
	foreach my $creator_name ( @creator_names ) { print "      creator: $creator_name\n" }
	print "  description: $description\n";
	my @format_names = split /\|/, $formats;
	foreach my $format_name ( @format_names ) { print "       format: $format_name\n" }
	print "       source: $source\n";
	print "       rights: $rights\n";
	print "         date: $date\n";
	print "\n";

	# index
	$index++;
	my $doc = $invindexer->new_doc( $index );
	$doc->set_value( identifier  => $identifier  );
	$doc->set_value( title       => $title       );
	$doc->set_value( subject     => $subjects    );
	$doc->set_value( creator     => $creators    );
	$doc->set_value( description => $description );
	$doc->set_value( format      => $formats );
	$doc->set_value( source      => $source );
	$doc->set_value( rights      => $rights );
	$doc->set_value( date        => $date );
	$doc->set_value( collection  => 'vra' );
	$invindexer->add_doc( $doc );

}


# optimize
$invindexer->finish( optimize => 1 );

# done
exit;


sub get_file_names {

	# get the full path
	my $file = $File::Find::name;
	
	# remove non-xml files and add them to the list
	next if ( $file !~ /\.xml$/ );
	push @vra_files, $file;
			
}


sub parse {

	# initialize
	my $file   = shift;
	my $parser = XML::LibXML->new;
	my ( @nodes, $node );
	my $vra = $parser->parse_file( $file );
	$vra = XML::LibXML::XPathContext->new( $vra );
	$vra->registerNs( 'v', 'http://www.vraweb.org/vracore4.htm' );

	# extract the identifier/url
	my $identifier = $vra->findvalue( '/v:vra/v:work/@refid' );
		
	# extract creators
	@nodes = ();
	my $creators = '';
	$creators = $vra->findvalue( '/v:vra/v:work/v:agentSet/v:display' );

	# extract the title
	my $title = '';
	$title = &normalize( $vra->findvalue( '/v:vra/v:work/v:titleSet/v:display' ));
	
	# extract the subjects
	@nodes = ();
	$node  = '';
	my $subjects = '';
	push @nodes, $vra->findnodes( '//v:subject/v:term'   );
	foreach $node ( @nodes ) { $subjects .= &normalize( $node->textContent ) . '|' }

	# extract (create) the description
	@nodes = ();
	$node  = '';
	my $description = '';
	push @nodes, $vra->findnodes( '//v:image/v:titleSet/v:display'   );
	$description = &normalize( $nodes[ 0 ]->textContent );
	
	# extract the format
	@nodes = ();
	$node  = '';
	my $formats = '';
	push @nodes, $vra->findnodes( '//v:image/v:worktypeSet/v:display'   );
	$formats = &normalize( $nodes[ 0 ]->textContent );
	
	# extract the source
	my $source = $vra->findvalue( '/v:vra/v:work/@source' );
	
	# extract the source
	my $rights = $vra->findvalue( '/v:vra/v:image/v:rightsSet/v:rights' );
	
	# extract the date
	my $date = $vra->findvalue( '/v:vra/v:work/v:dateSet/v:display' );
	
	# return a record
	return "$identifier\t$title\t$subjects\t$creators\t$description\t$formats\t$source\t$rights\t$date";

}


sub normalize {

	my $s = shift;
	while ( $s =~ /^\s/ ) { $s =~ s/^\s// }
	while ( $s =~ /\s$/ ) { $s =~ s/\s$// }
	$s =~ s/\t/ /g;
	$s =~ s/\n/ /g;
	$s =~ s/\r/ /g;
	while ( $s =~ /  / ) { $s =~ s/  / / }
	return $s;
	
}