#!/usr/bin/perl

# breath.cgi - using Protovis, display co-occurrences in the form of a network diagram

# Eric Lease Morgan
# December 24, 2010 - first investigations as a CGI script
# December 26, 2010 - added ability to configure radius and threshold
# December 28, 2010 - tweaked interface
# January 2, 2011   - added sliders
# January 9, 2011   - tweaked visualization; added matrix

# NOTE(review): the copy of this script that was reviewed had been collapsed
# onto a few lines and had its HTML markup stripped out of every string
# literal and out of the template heredoc. The Perl structure below is
# restored from the visible tokens. Markup that had to be re-created is a
# minimal reconstruction and is flagged where it occurs -- compare against
# the deployed original before relying on it.

use strict;
use warnings;

# configure
use constant CORPUS    => '../../etc/walden.txt';
use constant QUERY     => 'woods';
use constant RADIUS    => 60;
use constant THRESHOLD => 5;

# require
use CGI;
use Lingua::Concordance;
use Lingua::StopWords qw( getStopWords );
require '../../lib/breath.pl';   # supplies slurp, concordance, coocurances, same_breath

# initialize; r and t are echoed into the page and drive loop bounds, so they
# are only accepted when they are positive integers
my $cgi       = CGI->new;
my $input     = $cgi->param( 'q' );
my $query     = $input ? $input : QUERY;
my $radius    = _positive_integer( scalar $cgi->param( 'r' ), RADIUS );
my $threshold = _positive_integer( scalar $cgi->param( 't' ), THRESHOLD );
my $corpus    = slurp( CORPUS );
my $stopwords = getStopWords( 'en' );
$stopwords->{ 'one' }++;

# get initial words found near the query and sort them by frequency
my $words = concordance( $corpus, $query, $radius, $stopwords );
my @keys  = _keys_by_frequency( $words );

# process each of the most frequent words, up to the threshold; never walk
# past the end of @keys, because a rare query may yield fewer words than that
my %matrix = ();
my $last_key = $threshold - 1;
$last_key = $#keys if $#keys < $last_key;
for my $i ( 0 .. $last_key ) {

    my $term      = $keys[ $i ];
    my $neighbors = concordance( $corpus, $term, $radius, $stopwords );
    my @subkeys   = _keys_by_frequency( $neighbors );
    next unless @subkeys;

    my $coocurrances = coocurances( $subkeys[ 0 ], $neighbors, $threshold );
    my @list         = _keys_by_frequency( $coocurrances );
    next unless @list;

    # each row is keyed by its own most frequent word, which is also $list[ 0 ]
    $matrix{ $list[ 0 ] } = [ @list ];

}

# render the matrix as a table
# NOTE(review): the table/tr/td tags are reconstructed; the reviewed copy had
# empty strings here.
my $matrix = '<table>';
foreach my $key ( sort keys %matrix ) {

    $matrix .= '<tr>';
    foreach my $word ( @{ $matrix{ $key } } ) {
        $matrix .= '<td>' . $cgi->escapeHTML( $word ) . '</td>';
    }
    $matrix .= '</tr>';

}
$matrix .= '</table>';

# create an ordered list of the found words; each distinct word gets the next
# free index, and that index is what the links refer to
my %words = ();
my $index = 0;
foreach my $key ( sort keys %matrix ) {
    foreach my $word ( @{ $matrix{ $key } } ) {
        next if exists $words{ $word };
        $words{ $word } = $index++;
    }
}

# build a list of nodes for Protovis, in index order
my $nodes = join ',',
            map  { '{nodeName:"' . _js_escape( $_ ) . '"}' }
            sort { $words{ $a } <=> $words{ $b } } keys %words;

# build a list of links for Protovis; every row links its first word (the
# row's key) to each of the other words in the row
my @links = ();
foreach my $source ( sort keys %matrix ) {

    my $list        = $matrix{ $source };
    my $last_target = $threshold - 1;
    $last_target = $#{ $list } if $#{ $list } < $last_target;

    for my $i ( 1 .. $last_target ) {
        push @links, sprintf '{source:%d,target:%d}', $words{ $source }, $words{ $list->[ $i ] };
    }

}
my $links = join ',', @links;

# build the html
# NOTE(review): the script element and the variable name "data" are
# reconstructed -- the reviewed copy only showed qq(\n). The name must match
# whatever the code returned by same_breath() reads; confirm against breath.pl.
my $javascript = same_breath();
my $data       = qq(<script type="text/javascript">var data = { nodes:[$nodes], links:[$links] };</script>\n);

# the query comes straight from the request, so escape it before it is echoed
my $safe_query = $cgi->escapeHTML( $query );

my $html = template();
$html =~ s/##JAVASCRIPT##/$javascript/;
$html =~ s/##MATRIX##/$matrix/;
$html =~ s/##DATA##/$data/;
$html =~ s/##TITLEQUERY##/ :: $safe_query/g;
$html =~ s/##QUERY##/$safe_query/g;
$html =~ s/##BREATH##/$radius/g;
$html =~ s/##DETAIL##/$threshold/g;

# done
print $cgi->header;
print $html;
exit;


# Return $value when it is a positive integer, otherwise $default.
sub _positive_integer {

    my ( $value, $default ) = @_;
    return $default unless defined $value and $value =~ /\A[0-9]+\z/ and $value > 0;
    return $value + 0;

}


# Given a reference to a hash of word => count, return the words ordered from
# the highest count to the lowest. Ties are broken alphabetically so repeated
# requests produce the same diagram. Anything that is not a hash reference
# yields an empty list.
sub _keys_by_frequency {

    my ( $counts ) = @_;
    return () unless ref $counts eq 'HASH';
    my @sorted = sort { $counts->{ $b } <=> $counts->{ $a } or $a cmp $b } keys %$counts;
    return @sorted;

}


# Escape backslashes and double quotes so $text can sit inside a
# double-quoted JavaScript string literal.
sub _js_escape {

    my ( $text ) = @_;
    $text =~ s/(["\\])/\\$1/g;
    return $text;

}


# Return the page template. The ##NAME## tokens are placeholders filled in by
# the main program.
# NOTE(review): only the visible text and the placeholders ##DATA##,
# ##TITLEQUERY##, ##JAVASCRIPT##, ##QUERY## and ##MATRIX## survived in the
# reviewed copy. All tags below are a minimal reconstruction. The form is
# inferred from the q/r/t request parameters and from the ##BREATH## and
# ##DETAIL## substitutions; its position, the sliders, the hyperlink targets
# and the element the diagram is drawn into could not be recovered -- restore
# them from the original.
sub template {

    return <<EOT;
<html>
<head>
##DATA##
<title>Walden ##TITLEQUERY##</title>
##JAVASCRIPT##
</head>
<body>

<h1>Walden</h1>

<p>Play with this page to literally see what Henry David Thoreau said in the same breath when he used a given word in his book called Walden.</p>

<form method="get" action="">
<input type="text" name="q" value="##QUERY##" />
<input type="text" name="r" value="##BREATH##" />
<input type="text" name="t" value="##DETAIL##" />
<input type="submit" />
</form>

<p>How to get the most out of this application:</p>

<ol>
<li>Enter or a word (or "regular expression") to locate in the book. This is your query.</li>
<li>Change the size of the breath to increase or decrease the number of characters on either side of the query where co-occurances will be found. Values between 40 and 80 work well.</li>
<li>Change the amount of detail to increase or decrease the number of co-occurances to identify for each query. Values between 4 and 7 work well.</li>
<li>Adjust the breath and detail until the resulting diagram forms simple patterns with very few crossing lines.</li>
</ol>

<p>The resulting graphic will tell you something about the text, and you will be doing "distant reading".</p>

<p>The visualization is based on the following matrix of terms, starting with and building upon "##QUERY##":</p>

##MATRIX##

<p>Zoom in and out to see detail. Drag nodes to simplify the diagram. Note enclosed polygons to "read" coherent thoughts. (Matrix)</p>

<p>Interesting queries: fish, god, ice or snow, man, pond, science, Shakespeare or Bacon, spring, walden, woodchuck. Compare them with another book -- A Week on the Concord and Merrimack Rivers.</p>

<p>For more information, see the blog posting.</p>

<pre>
Author: Eric Lease Morgan &lt;eric_morgan\@infomotions.com&gt;
Date created: December 24, 2010
Date updated: January 9, 2011
URL: http://infomotions.com/sandbox/network-diagrams/bin/walden/
</pre>

</body>
</html>
EOT

}