#!/usr/bin/perl

# ead2kinosearch.pl - index ead files using kinosearch

# Eric Lease Morgan <emorgan@nd.edu>
# 2007-04-19 - actually got it to go
# 2007-04-17 - first cut

use constant DATADIR => ( '../etc/ead' );
use constant INDEX   => '../etc/index';

# require
use strict;
use File::Find;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# set up the indexer: English-language analysis, index stored at INDEX
# NOTE(review): create => 0 presumably means the index at INDEX must already
# exist -- confirm against the KinoSearch::InvIndexer documentation
my $analyzer   = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
	invindex => INDEX,
	create   => 0,
	analyzer => $analyzer
);

# declare the fields every document may carry, in a fixed order
foreach my $field_name ( qw( identifier title creator subject description publisher collection ) ) {
	$invindexer->spec_field( name => $field_name );
}

# collect the files to index; the get_file_names callback fills @ead_files
my @ead_files;
find( \&get_file_names, DATADIR );

# first pass: add one document per file
my $index = 0;
for my $file ( @ead_files ) {

	# parse() hands back one tab-delimited record; break it into its fields
	my $record = parse( $file );
	my ( $identifier, $title, $subjects, $creators, $description, $publisher ) = split /\t/, $record;

	# display/debug; subjects and creators are '|'-delimited lists
	print "   identifier: $identifier\n";
	print "        title: $title\n";
	print "      subject: $_\n" for split /\|/, $subjects;
	print "      creator: $_\n" for split /\|/, $creators;
	print "  description: $description\n";
	print "    publisher: $publisher\n";
	print "\n";

	# index; the values are set in the order listed here
	$index++;
	my $doc = $invindexer->new_doc( $index );
	my @field_values = (
		[ identifier  => $identifier  ],
		[ title       => $title       ],
		[ subject     => $subjects    ],
		[ creator     => $creators    ],
		[ description => $description ],
		[ publisher   => $publisher   ],
		[ collection  => 'ead'        ],
	);
	$doc->set_value( @{ $_ } ) for @field_values;
	$invindexer->add_doc( $doc );

}

# do it again, but for each folder this time: every component element
# (c, c01 .. c12) of every file becomes a document of its own, carrying the
# file-level metadata plus the component's unit title as its description
foreach my $file ( @ead_files ) {

	# initialize
	my $ead = XML::LibXML->new->parse_file( $file );

	# extract the identifier/url
	my $identifier = $ead->findvalue( '/ead/eadheader/eadid/@url' );

	# extract creators; each name is followed by a '|'.  Start from an empty
	# string so a file without creators yields '' rather than undef
	my $creators = '';
	foreach my $xpath ( '//origination/corpname', '//origination/persname', '//origination/famname' ) {
		foreach my $node ( $ead->findnodes( $xpath ) ) { $creators .= normalize( $node->textContent ) . '|' }
	}

	# extract the title; like parse(), use "title / subtitle" when there is
	# a subtitle so both passes index the same title for a given file
	my $title       = normalize( $ead->findvalue( '/ead/eadheader/filedesc/titlestmt/titleproper' ));
	my $subtitle    = normalize( $ead->findvalue( '/ead/eadheader/filedesc/titlestmt/subtitle' ));
	my $titleproper = $subtitle ? "$title / $subtitle" : $title;

	# extract the subjects; same '|'-terminated layout as the creators
	my $subjects = '';
	foreach my $xpath ( '//controlaccess/subject', '//controlaccess/persname', '//controlaccess/corpname', '//controlaccess/genreform' ) {
		foreach my $node ( $ead->findnodes( $xpath ) ) { $subjects .= normalize( $node->textContent ) . '|' }
	}

	# extract the publisher
	my $publisher = normalize( $ead->findvalue( '/ead/eadheader/filedesc/publicationstmt/publisher' ));

	# the debug listings are the same for every folder of this file, so
	# split them once here instead of once per folder
	my @subject_terms = split /\|/, $subjects;
	my @creator_names = split /\|/, $creators;

	# loop through each folder
	foreach my $node ( $ead->findnodes( '//c|//c01|//c02|//c03|//c04|//c05|//c06|//c07|//c08|//c09|//c10|//c11|//c12' ) ) {

		# get this folder
		my $folder = normalize( $node->findvalue( './did/unittitle' ));

		# display/debug
		print "   identifier: $identifier\n";
		print "        title: $titleproper\n";
		foreach my $subject_term ( @subject_terms ) { print "      subject: $subject_term\n" }
		foreach my $creator_name ( @creator_names ) { print "      creator: $creator_name\n" }
		print "       folder: $folder\n";
		print "    publisher: $publisher\n";
		print "\n";

		# index
		# NOTE(review): unlike the file-level documents, folder documents are
		# given no 'collection' value -- confirm whether that is intentional
		$index++;
		my $doc = $invindexer->new_doc( $index );
		$doc->set_value( identifier  => $identifier  );
		$doc->set_value( title       => $titleproper );
		$doc->set_value( subject     => $subjects    );
		$doc->set_value( creator     => $creators    );
		$doc->set_value( description => $folder      );
		$doc->set_value( publisher   => $publisher   );
		$invindexer->add_doc( $doc );

	}

}


# optimize
$invindexer->finish( optimize => 1 );

# done
exit;


sub get_file_names {

	# File::Find callback: remember every path ending in .xml in @ead_files.
	# Takes no arguments; the current path is read from $File::Find::name.

	# get the full path
	my $file = $File::Find::name;

	# skip non-xml files and add the rest to the list.  Leave with return,
	# not next: next can only exit a sub when the caller happens to be inside
	# a loop or block, and is a fatal error when it is not
	return if ( $file !~ /\.xml$/ );
	push @ead_files, $file;

}


sub parse_folders {

	# placeholder: takes no action and returns nothing
	return;

}

sub parse {

	# Read one EAD file and return a single tab-delimited record holding, in
	# order: identifier, title, subjects, creators, description, publisher.
	# Subjects and creators are lists in which every entry is followed by '|'.
	my ( $file ) = @_;
	my $ead = XML::LibXML->new->parse_file( $file );

	# normalized string value of one XPath expression
	my $value_of = sub {
		my ( $xpath ) = @_;
		return normalize( $ead->findvalue( $xpath ) );
	};

	# normalized text of every node matched by the given XPath expressions,
	# taken expression by expression, each followed by '|'
	my $terms_of = sub {
		my @found = map { $ead->findnodes( $_ ) } @_;
		return join '', map { normalize( $_->textContent ) . '|' } @found;
	};

	# the identifier/url is used as found
	my $identifier = $ead->findvalue( '/ead/eadheader/eadid/@url' );

	my $creators = $terms_of->(
		'//origination/corpname',
		'//origination/persname',
		'//origination/famname',
	);

	# the title becomes "title / subtitle" when there is a subtitle
	my $title       = $value_of->( '/ead/eadheader/filedesc/titlestmt/titleproper' );
	my $subtitle    = $value_of->( '/ead/eadheader/filedesc/titlestmt/subtitle' );
	my $titleproper = $subtitle ? "$title / $subtitle" : $title;

	my $subjects = $terms_of->(
		'//controlaccess/subject',
		'//controlaccess/persname',
		'//controlaccess/corpname',
		'//controlaccess/genreform',
	);

	my $description = $value_of->( '/ead/archdesc/scopecontent' );
	my $publisher   = $value_of->( '/ead/eadheader/filedesc/publicationstmt/publisher' );

	# return a record
	return join "\t", $identifier, $titleproper, $subjects, $creators, $description, $publisher;

}


sub normalize {

	# Tidy a string: strip whitespace from both ends, turn tabs, newlines and
	# carriage returns into spaces, and squeeze runs of spaces down to one.
	# Returns the tidied copy; the caller's value is not modified.
	my ( $text ) = @_;

	# trim both ends
	$text =~ s/^\s+//;
	$text =~ s/\s+$//;

	# tabs and line breaks become plain spaces
	$text =~ s/[\t\n\r]/ /g;

	# collapse every run of spaces into a single space
	$text =~ s/ {2,}/ /g;

	return $text;

}