#!/usr/bin/perl
#
# gscloud.pl - Google Search Cloud
#
# (c) Copyright, 2006 by John Bokma, http://johnbokma.com/
# License: The Artistic License
#
# Last updated: 2006-10-19 10:30:23 -0500

use strict;
use warnings;

use Carp;
use Encode;
use HTML::Entities;
use URI::Escape;
use Getopt::Long;

# Start time of the run; the elapsed seconds are reported in the page footer.
my $time = time;

# Defaults for the command-line options (each can be overridden through
# GetOptions below).
my $steps = 18;         # number of cloud sizes
my $mapping = 'log';    # frequency-to-size mapping: 'log' or 'lin'
my $sort = 'alpha';     # phrase order inside a cloud: 'alpha' or 'num'
my $limit = 75;         # maximum number of phrases per cloud
my $scale = 0;          # when true, use fewer sizes if there are fewer phrases
my $prefix = '';        # when non-empty, section titles link to $prefix$path


# Print the command-line synopsis and terminate the script.
#
# Takes no arguments and never returns. The text goes to STDERR and the exit
# status is 1, because this is called when the command line is unusable;
# that way a redirected STDOUT does not end up holding the usage text and
# calling scripts can detect the failure.
#
# The values shown are whatever the file-level option variables hold when
# the sub is called.
sub print_usage_and_exit {

	print STDERR <<USAGE;
usage: gscloud.pl [OPTIONS] ACCESS_LOG

options:

	--steps   - number of cloud sizes, default $steps
	--mapping - log or lin, default $mapping
	--sort    - alpha or num, default $sort
	--limit   - maximum number of phrases, default $limit
	--scale   - scale when phrases less than steps, default $scale
	--prefix  - prefix for paths (creates links), default none
USAGE

	exit 1;
}


# Parse the command line. GetOptions returns false when it meets an unknown
# option or a value of the wrong type (it has already warned by then), so
# show the usage text instead of running with a half-parsed configuration.
GetOptions(

	"steps=i"   => \$steps,
	"mapping=s" => \$mapping,
	"sort=s"    => \$sort,
	"limit=i"   => \$limit,
	"scale=i"   => \$scale,
	"prefix=s"  => \$prefix
) or print_usage_and_exit();

# The first remaining argument names the access log to analyse.
my $filename = shift;
defined $filename or print_usage_and_exit();

# Open the log with the three-argument form of open so the name is always
# taken literally: leading/trailing '<', '>' or '|' characters in it cannot
# change the open mode or start a command. '-' keeps meaning "read STDIN",
# which the two-argument form used to provide implicitly.
my $fh;
if ( $filename eq '-' ) {

	$fh = \*STDIN;
}
else {

	open $fh, '<', $filename or
		die "Can't open '$filename' for reading: $!";
}

# Collect the Google search phrases per requested path and response status.
#
# %stats maps "PATH:STATUS" to a hash with two entries:
#   sum     - number of matching log lines for that key
#   queries - hash of search phrase => number of occurrences
my %stats;
while ( my $line = <$fh> ) {

	# A line is of interest when it is a GET request whose referrer is a
	# Google URL (http or https, any country domain) carrying a search
	# phrase. The captures are the requested path, the response status and
	# the raw phrase.
	#
	# The phrase is taken from the first parameter named exactly q or as_q:
	# the parameter name has to follow '?' or '&', and the scan up to it is
	# non-greedy. A greedy scan for a bare "q=" would instead pick the last
	# parameter whose name merely ends in q (aq=, oq=, ...) and report its
	# value as the search phrase.
	$line =~ m!

		\[\d{2}/\w{3}/\d{4}(?::\d\d){3}.+?\]
		\s"GET\s(\S+)\sHTTP/\d\.\d"
		\s(\S+)
		\s\S+
		\s"https?://w{1,3}\.google\.
		(?:[a-z]{2}|com?\.[a-z]{2}|com)\.?/
		[^\"]*?[?&](?:as_)?q=([^\"&]+)[^\"]*"

	!xi or next;

	my ( $path, $status, $query ) = ( $1, $2, $3 );

	# Undo the URL encoding ('+' stands for a space), collapse runs of
	# whitespace and trim both ends, then turn the UTF-8 bytes into
	# characters.
	$query =~ s/\+/ /g;
	$query = join ' ' => split ' ', uri_unescape $query;
	$query = Encode::decode_utf8 $query;

	$stats{ "$path:$status" }{ sum }++;
	$stats{ "$path:$status" }{ queries }{ $query }++;
}

close $fh or die "Can't close '$filename' after reading: $!";

print_html_start();

# One section per path/status combination, busiest first. Equal totals are
# ordered by key so the report does not depend on hash order.
my @ps = sort {
	$stats{ $b }{ sum } <=> $stats{ $a }{ sum } || $a cmp $b
} keys %stats;
for my $ps ( @ps ) {

	# Keys have the form "PATH:STATUS"; split at the last colon because the
	# path itself may contain colons.
	my ( $path, $status ) = $ps =~ /\A(.*):(\S+)\z/s;
	my $sum = $stats{ $ps }{ sum };

	# Path and status are copied verbatim from the access log, i.e. they are
	# chosen by whoever sent the request. Escape them before they become
	# part of the page (text and href attribute alike) so a crafted request
	# cannot inject markup.
	my $section = encode_entities( $path );
	$prefix and $section = sprintf '<a href="%s">%s</a>',
		encode_entities( "$prefix$path" ), $section;

	print "<h2>$section",
		qq( <span class="small">total: $sum, status: ),
		encode_entities( $status ),
		"</span>",
		"</h2>\n";
	print_cloud_as_html_list(

		frequencies => $stats{ $ps }{ queries },
		steps => $steps,
		mapping => $mapping,
		sort => $sort,
		limit => $limit,
		scale => $scale,
	);
}

# The footer reports how long the whole run took.
print_html_end( time - $time );
exit;


# Write the fixed opening part of the report to the currently selected
# output handle: doctype, <head> (charset, title, stylesheet link), the
# opening <body> tag and the page heading. Takes no arguments.
sub print_html_start {

	my @head_lines = (
		q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"},
		q{ "http://www.w3.org/TR/html4/strict.dtd">},
		q{<html>},
		q{<head>},
		qq{\t<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">},
		qq{\t<title>Google Search Cloud (beta)</title>},
		qq{\t<link rel="stylesheet" type="text/css" href="gscloud.css">},
		q{</head>},
		q{<body>},
		q{<h1>Google <span class="blue">Search Cloud</span>},
		q{<span class="beta">beta</span></h1>},
	);

	# Every line, the last one included, ends with a newline.
	print join '', map { "$_\n" } @head_lines;
}


# Write the page footer to the currently selected output handle.
#
# The single argument is the number of seconds the run took; it is
# interpolated into the footer text as given.
sub print_html_end {

	my ( $elapsed ) = @_;

	print join '',
		qq{<div class="footer">\n},
		qq{\t<a href="http://johnbokma.com/perl/google-search-cloud.html">Google\n},
		qq{\tSearch Cloud</a>, written by John Bokma, took $elapsed seconds to\n},
		qq{\tgenerate this page.\n},
		qq{</div>\n};
}


# Print a cloud for a set of phrases as an HTML unordered list.
#
# Named parameters:
#   frequencies - required; hash reference mapping phrase => count. With
#                 the 'log' mapping the counts must be positive, since
#                 their logarithm is taken.
#   steps       - required (a false value counts as missing); number of
#                 size classes.
#   mapping     - 'log' (default) or 'lin'; how a count is mapped to a
#                 size class.
#   sort        - 'alpha' (default) lists the phrases case-insensitively by
#                 name, 'num' lists them by descending count.
#   limit       - optional; when true only the `limit` most frequent
#                 phrases are listed.
#   scale       - optional; when true and there are fewer phrases than
#                 steps, the number of phrases is used as the number of
#                 steps.
#
# Output goes to the currently selected handle: <ul class="cloud"> holding
# one <li class="sizeN"> per phrase, N running from 1 (least frequent) to
# the number of steps (most frequent), with the phrase HTML-escaped. When
# all listed phrases have the same count every item gets size 1. Nothing is
# printed when there are no phrases.
#
# Croaks when 'frequencies' or 'steps' is missing or when 'mapping' or
# 'sort' has an unsupported value.
sub print_cloud_as_html_list {

	my %params = @_;

	my $frequencies = $params{ frequencies }
		or croak "Parameter 'frequencies' not given";

	my $steps = $params{ steps }
		or croak "Parameter 'steps' not given";

	my $mapping = $params{ mapping } || 'log';
	$mapping eq 'log' or $mapping eq 'lin'
		or croak "Parameter 'mapping' has an unsupported value ($mapping)";

	my $sort_method = $params{ sort } || 'alpha';
	$sort_method eq 'alpha' or $sort_method eq 'num'
		or croak "Parameter 'sort' has an unsupported value ($sort_method)";

	# Most frequent first. Equal counts are ordered by phrase: without the
	# tie-break, which phrases survive the limit (and the order of a 'num'
	# listing) would follow hash order and differ between runs on the same
	# input.
	my @keys = sort {
		$frequencies->{ $b } <=> $frequencies->{ $a } || $a cmp $b
	} keys %$frequencies;

	# if there is a limit, take the top limit frequencies
	$params{ limit } and @keys = splice @keys, 0, $params{ limit };
	@keys or return;    # nothing to do

	$steps = @keys if $params{ scale } and $steps > @keys;
	my $max_steps = $steps - 1;

	# @keys is still ordered by descending count here, so the first and
	# last entries carry the extremes.
	my ( $max, $min ) = @$frequencies{ $keys[ 0 ], $keys[ -1 ] };

	print qq(<ul class="cloud">\n);

	# $step maps a phrase to its size class. The $min == $max case is
	# handled separately because both formulas divide by the spread of the
	# counts, which is zero then.
	my $step = $min == $max
		? sub { 1 }
		: $mapping eq 'log'
			? sub {

				1 + int( $max_steps * (
					( log( $frequencies->{ $_[ 0 ] } ) - log( $min )) /
					( log( $max ) - log( $min ) ) )
				)
			}
			: sub {

				1 + int( $max_steps *
					( $frequencies->{ $_[ 0 ] } - $min ) /
					( $max - $min )
				)

			};

	# Phrases differing only in case are ordered by their exact spelling so
	# the listing is reproducible.
	$sort_method eq 'alpha'
		and @keys = sort { lc( $a ) cmp lc( $b ) || $a cmp $b } @keys;

	print '  <li class="size' . $step->( $_ ) . '">',
		encode_entities( $_ ), "</li>\n" for @keys;

	print "</ul>\n";
}
