#!/usr/bin/perl

# tei2kinosearch.pl - index tei files using kinosearch

# Eric Lease Morgan <emorgan@nd.edu>
# 2007-04-26 - first cut; based on ead2kinosearch.pl

use constant DATADIR => ( '../etc/tei' );
use constant INDEX   => '../etc/index';

# require
use strict;
use File::Find;
use XML::LibXML;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

# set up the analyzer and the indexer that writes to INDEX
my $analyzer   = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $invindexer = KinoSearch::InvIndexer->new(
	invindex => INDEX,
	create   => 0,
	analyzer => $analyzer
);

# declare every field a document may carry, in a fixed order
foreach my $field_name ( qw( identifier title creator subject description publisher collection ) ) {
	$invindexer->spec_field( name => $field_name );
}

# walk the data directory; get_file_names fills @tei_files as a side effect
my @tei_files;
find( \&get_file_names, DATADIR );

# parse, report on, and index each file in turn
my $index = 0;
foreach my $file ( @tei_files ) {

	# parse() hands back one tab-delimited record per file
	my $record = parse( $file );
	my ( $identifier, $title, $subjects, $creators, $description, $publisher ) = split /\t/, $record;

	# display/debug; subjects and creators are pipe-delimited lists
	print "   identifier: $identifier\n";
	print "        title: $title\n";
	print "      subject: $_\n" for split /\|/, $subjects;
	print "      creator: $_\n" for split /\|/, $creators;
	print "  description: $description\n";
	print "    publisher: $publisher\n";
	print "\n";

	# build the document; the running count is passed to new_doc
	my $doc = $invindexer->new_doc( ++$index );
	my @field_values = (
		identifier  => $identifier,
		title       => $title,
		subject     => $subjects,
		creator     => $creators,
		description => $description,
		publisher   => $publisher,
		collection  => 'tei',
	);

	# consume the list two items at a time so fields are set in the order listed
	while ( my ( $name, $value ) = splice @field_values, 0, 2 ) {
		$doc->set_value( $name => $value );
	}
	$invindexer->add_doc( $doc );

}


# finish the indexing run, asking for an optimized index
$invindexer->finish( optimize => 1 );

# done
exit;


sub get_file_names {

	# File::Find callback: called once for every entry found under the
	# directory being searched. Appends the path of each entry whose name
	# ends in ".xml" to the file-level @tei_files list. Takes no arguments;
	# the current path is read from $File::Find::name.

	# get the full path
	my $file = $File::Find::name;

	# skip anything that is not an xml file; leave with return rather than
	# next, because there is no loop in this sub and next would jump into
	# whichever loop inside File::Find happens to be calling us
	return if ( $file !~ /\.xml$/ );

	# remember the file for later processing
	push @tei_files, $file;
	return;

}


sub parse {

	# Parse one TEI file and return its metadata as a single tab-delimited
	# record with six fields, in this order:
	#
	#   identifier, title, subjects, creators, description, publisher
	#
	# Argument: the path of the TEI file to read.
	# Subjects are returned as a list in which every term is followed by
	# a "|"; creators holds the first author name only. A missing title or
	# author yields an empty field. Any exception raised while parsing the
	# file is not caught here and propagates to the caller.

	# initialize
	my $file   = shift;
	my $parser = XML::LibXML->new;
	my $tei    = $parser->parse_file( $file );

	# extract the identifier/url; normalized like every other field so an
	# embedded tab or newline cannot shift the fields of the record
	my $identifier = normalize( $tei->findvalue( '/TEI.2/teiHeader/fileDesc/publicationStmt/idno' ) );

	# extract creators; only the first matching node is used, and a
	# document without one gets an empty value instead of a fatal
	# method call on undef
	my ( $author ) = $tei->findnodes( '/TEI.2/teiHeader/fileDesc/titleStmt/author/name' );
	my $creators = defined $author ? normalize( $author->textContent ) : '';

	# extract the title, with the same guard against a missing node
	my ( $title_node ) = $tei->findnodes( '/TEI.2/teiHeader/fileDesc/titleStmt/title' );
	my $title = defined $title_node ? normalize( $title_node->textContent ) : '';

	# extract the subjects; each term is terminated by a pipe
	my $subjects = '';
	foreach my $item ( $tei->findnodes( '//keywords/list/item' ) ) {
		$subjects .= normalize( $item->textContent ) . '|';
	}

	# extract (create) the description: the first 253 characters of the
	# body text with an ellipsis always appended, then normalized
	my $description = normalize( substr( $tei->findvalue( '/TEI.2/text/body' ), 0, 253 ) . '...' );

	# extract the publisher
	my $publisher = normalize( $tei->findvalue( '/TEI.2/teiHeader/fileDesc/publicationStmt/publisher' ) );

	# return a record
	return "$identifier\t$title\t$subjects\t$creators\t$description\t$publisher";

}


sub normalize {

	# Clean up a string for display and indexing: strip whitespace from
	# both ends, turn embedded tabs, newlines, and carriage returns into
	# spaces, and squeeze runs of spaces down to one. Returns the cleaned
	# copy; the caller's value is not modified.
	my ( $text ) = @_;

	# trim both ends in one pass each
	$text =~ s/^\s+//;
	$text =~ s/\s+$//;

	# flatten the remaining line breaks and tabs
	$text =~ s/[\t\n\r]/ /g;

	# collapse every run of two or more spaces
	$text =~ s/ {2,}/ /g;

	return $text;

}