package Apache2::Highlights;

# highlights.cgi - create lists from a MyLibrary database

# Eric Lease Morgan
# September  8, 2009 - first investigations; based on MyLibrary, obviously
# September 12, 2009 - added search; tweaked wording
# September 13, 2009 - tweaked keywords; made it a package
# September 15, 2009 - tweaked about

# configure
use constant INSTANCE       => 'highlights';
use constant LOCATION_LOCAL => 2;
use constant SOLR           => 'http://localhost:210/solr/highlights';
use constant CMDTERM        => 'term';

# require/use
use Apache2::Const -compile => qw( OK );
use CGI;
use MyLibrary::Core;
use strict;
use warnings;
use WebService::Solr;


# handler - mod_perl response handler; dispatches on the "cmd" parameter
#
#   (no cmd)    home page: number of resources plus facet/term listing
#   cmd=about   about page
#   cmd=search  Solr search using the "query" parameter
#   cmd=term    resources related to the MyLibrary term given by "id"
#
# Anything else (an unknown cmd, or a term that cannot be found) falls back
# to the home page instead of sending an empty body.
# Takes the Apache request object, prints the page, returns Apache2::Const::OK.
sub handler {

    # initialize
    my $r   = shift;
    my $cgi = CGI->new;
    my $cmd = $cgi->param( 'cmd' );
    $cmd = '' unless defined $cmd;
    MyLibrary::Config->instance( INSTANCE );

    # branch accordingly; each page builder returns undef when it cannot
    # produce its page
    my $html;
    if    ( $cmd eq 'about' )  { $html = _about_page() }
    elsif ( $cmd eq 'search' ) { $html = _search_page( $cgi ) }
    elsif ( $cmd eq CMDTERM )  { $html = _term_page( $cgi ) }
    $html = _home_page( $cgi ) unless defined $html;

    # done
    $html =~ s/##VERSION##/MyLibrary->version/e;
    $r->content_type( 'text/html' );
    $r->print( $html );
    return Apache2::Const::OK;

}


# _fill - replace ##NAME## placeholders in a page
#
# Takes the page text followed by NAME => value pairs, applied in the order
# given (so CONTENT can introduce placeholders that later pairs fill in).
# Every occurrence of a placeholder is replaced; undefined values become ''.
# Returns the filled-in text.
sub _fill {

    my ( $html, @pairs ) = @_;
    while ( @pairs ) {
        my ( $name, $value ) = splice @pairs, 0, 2;
        $value = '' unless defined $value;
        $html =~ s/##\Q$name\E##/$value/g;
    }
    return $html;

}


# _home_page - the template filled with the home text, item count, and facets
sub _home_page {

    my $cgi    = shift;
    my $count  = number_of_items();
    my $facets = facet_term_combinations( $cgi );
    return _fill( template(),
        CONTENT             => scalar home(),
        QUERY               => '',
        NUMBER_OF_RESOURCES => $count,
        FACETS              => $facets,
    );

}


# _about_page - the template filled with the about text
sub _about_page {

    return _fill( template(),
        CONTENT => scalar about(),
        QUERY   => '',
    );

}


# _search_page - run the "query" parameter against Solr and list the hits
#
# A missing or blank query is reported as zero hits without contacting Solr.
sub _search_page {

    my $cgi   = shift;
    my $query = $cgi->param( 'query' );
    $query = '' unless defined $query;

    # search
    my @hits = ();
    if ( $query =~ /\S/ ) {
        my $solr     = WebService::Solr->new( SOLR );
        my $response = $solr->search( $query );
        @hits = $response->docs;
    }

    # build the hit list
    my $list = '';
    foreach my $doc ( @hits ) { $list .= _hit_item( $cgi, $doc ) }
    $list = $cgi->ol( $list );

    # the query is user input echoed back into the page, so it is escaped;
    # it is filled in last so that its text is never itself scanned for
    # the HITS/RESULTS placeholders
    my $safe_query = $cgi->escapeHTML( $query );
    return _fill( template(),
        CONTENT => scalar results(),
        HITS    => scalar @hits,
        RESULTS => $list,
        QUERY   => $safe_query,
    );

}


# _hit_item - render one Solr document as a list item
#
# The item links to the document's local URL and carries a hidden block of
# details (div id "h-<id>") that the "details..." link reveals by calling a
# JavaScript function named expand -- not defined in this sub; presumably
# supplied by the page template (TODO confirm).
sub _hit_item {

    my ( $cgi, $doc ) = @_;

    # slurp
    my $id         = 'h-' . $doc->value_for( 'id' );
    my $name       = $doc->value_for( 'name' );
    my $note       = $doc->value_for( 'note' );
    my $local_url  = $doc->value_for( 'local_url' );
    my $remote_url = $doc->value_for( 'remote_url' );
    foreach ( $name, $note, $local_url ) { $_ = '' unless defined $_ }

    # details; single-valued fields are listed only when they have a value
    my $sublist = '';
    my @fields  = (
        [ 'Creator(s)',     'creator' ],
        [ 'Date published', 'date_published' ],
        [ 'Date read',      'date_read' ],
        [ 'Source',         'source' ],
        [ 'Rights',         'rights' ],
    );
    foreach my $field ( @fields ) {
        my ( $label, $key ) = @$field;
        my $value = $doc->value_for( $key );
        if ( $value ) { $sublist .= $cgi->li( "$label - $value" ) }
    }
    if ( $remote_url ) {
        $sublist .= $cgi->li( "Remote URL - " . $cgi->a({ href => $remote_url }, $remote_url ));
    }

    # multi-valued fields; keywords are optional, formats and themes are
    # always listed
    my $keywords = join '', map { "$_; " } $doc->values_for( 'facet_subject' );
    my $formats  = join '', map { "$_; " } $doc->values_for( 'Formats' );
    my $themes   = join '', map { "$_; " } $doc->values_for( 'Themes' );
    if ( $keywords ) { $sublist .= $cgi->li( "Keywords - $keywords" ) }
    $sublist .= $cgi->li( "Formats - $formats" );
    $sublist .= $cgi->li( "Themes - $themes" );

    $sublist = $cgi->ul({ style => 'margin-bottom: 1em' }, $sublist );
    $sublist = $cgi->div({ -id => $id, -style => 'display: none' }, $sublist );

    # create the hit
    my $toggle = $cgi->a({ -href  => "javascript:expand('$id')",
                           -style => 'color: grey; text-decoration: none' }, 'details...' );
    return $cgi->li( $cgi->a({ href => $local_url }, $name ) . " - $note" . '  ' . $toggle . $sublist );

}


# _term_page - list the resources related to the term given by "id"
#
# Returns undef when no id was supplied or the term lookup yields nothing,
# so the caller can fall back to another page.
# NOTE(review): assumes MyLibrary::Term->new returns a false value for an
# unknown id rather than dying -- confirm against the MyLibrary documentation.
sub _term_page {

    my $cgi = shift;

    # get the input and find the term
    my $term_id = $cgi->param( 'id' );
    return unless defined $term_id && length $term_id;
    my $term = MyLibrary::Term->new( id => $term_id );
    return unless $term;

    # build an html list of resources for the input
    my $list = '';
    foreach my $id ( $term->related_resources( sort => 'name' ) ) {

        # get this resource
        my $resource = MyLibrary::Resource->new( id => $id );
        next unless $resource;

        # urls; when several local locations exist the last one wins
        my $local_url = '';
        foreach my $location ( $resource->resource_locations ) {
            if ( $location->resource_location_type == LOCATION_LOCAL ) { $local_url = $location->location }
        }

        # build a list
        my $note = $resource->note;
        $note = '' unless defined $note;
        $list .= $cgi->li( $cgi->a({ -href => $local_url }, $resource->name ) . ' - ' . $note );

    }
    $list = $cgi->ol( $list );

    # display the term page
    my $term_name = $term->term_name;
    my $term_note = $term->term_note;
    return _fill( template(),
        CONTENT       => scalar term(),
        QUERY         => '',
        TERM          => $term_name,
        TERM_NOTE     => $term_note,
        RESOURCE_LIST => $list,
    );

}


# number_of_items - the number of resources in the MyLibrary instance
sub number_of_items {

    return scalar( MyLibrary::Resource->get_resources( output => 'id' ) );

}


# facet_term_combinations - an ordered list of facets, each with its terms
#
# Takes the CGI object used to generate markup. Every term is linked to this
# handler's term command and followed by the number of related resources.
# NOTE(review): the list markup and the term links were rebuilt here; the
# link is written relative to the current URL ("?cmd=term&id=N") -- confirm
# this matches how the handler is mounted.
sub facet_term_combinations {

    my $cgi = shift;

    # get all the facets, build a list, and display it
    my $items = '';
    foreach my $facet ( MyLibrary::Facet->get_facets( sort => 'name' ) ) {

        # create a list of terms associated with each facet
        my $terms = '';
        foreach my $id ( $facet->related_terms( sort => 'name' ) ) {
            my $term = MyLibrary::Term->new( id => $id );
            next unless $term;
            my $link = $cgi->a({ -href => '?cmd=' . CMDTERM . '&id=' . $id }, $term->term_name );
            $terms .= $cgi->li( $link . ' (' . scalar( $term->related_resources ) . ' items)' );
        }
        $terms = $cgi->ol( $terms );

        # build the list
        my $facet_note = $facet->facet_note;
        $facet_note = '' unless defined $facet_note;
        $items .= $cgi->li( $facet->facet_name, ' - ', $facet_note, $terms );

    }

    # done
    return $cgi->ol( $items );

}


# NOTE(review): the page-fragment subs from here to the end of the file
# (term, template, results, home, about) return here-documents, but the
# "<<EOF" operators and the markup inside them appear to have been lost from
# this copy of the file; restore them from a known-good copy.
sub term { return <Highlights & Annotations - ##TERM##

##TERM_NOTE##

##RESOURCE_LIST## EOF } sub template { return < Highlights & Annotations
##CONTENT##
EOF } sub results { return <Search results

Your search -- ##QUERY## -- found ##HITS## hit(s):

##RESULTS## EOF } sub home { return <Highlights & Annotations: A value-added reading list

This is a collection of things I've read with complete access to PDF versions. The collection presently includes ##NUMBER_OF_RESOURCES## items divided into the following set of facets and terms:

##FACETS## EOF } sub about { return <About Highlights & Annotations

I have been having fun recently indexing PDF files.

For the past six months or so I have been keeping the articles I've read in a pile, and I was rather amazed at the size of the pile. It was about a foot tall. When I read these articles I "actively" read them -- meaning, I write, scribble, highlight, and annotate the text with my own special notation denoting names, keywords, definitions, citations, quotations, list items, examples, etc. This active reading process: 1) makes for better comprehension on my part, and 2) makes the articles easier to review and pick out the ideas I thought were salient. Being the librarian I am, I thought it might be cool ("kewl") to make the articles into a collection. Thus, the beginnings of Highlights & Annotations: A Value-Added Reading List.

The techno-weenie process for creating and maintaining the content is something this community might find interesting:

  1. Print article and read it actively.
  2. Convert the printed article into a PDF file -- complete with embedded OCR -- with my handy-dandy ScanSnap scanner.
  3. Use MyLibrary to create metadata (author, title, date published, date read, note, keywords, facet/term combinations, local and remote URLs, etc.) describing the article.
  4. Save the PDF to my file system.
  5. Use pdftotext to extract the OCRed text from the PDF and index it along with the MyLibrary metadata using Solr.
  6. Provide a searchable/browsable user interface to the collection through a mod_perl module.

Software is never done, and if it were then it would be called hardware. Accordingly, I know there are some things I need to do before I can truly deem the system version 1.0. At the same time my excitement is overflowing and I thought I'd share some geekdom with my fellow hackers.

Fun with PDF files and open source software.

EOF } 1; # return true or die