#!/usr/bin/perl
#
# gscloud.pl - Google Search Cloud
#
# (c) Copyright, 2006 by John Bokma, http://johnbokma.com/
# License: The Artistic License
#
# Last updated: 2006-10-19 10:30:23 -0500

use strict;
use warnings;

use Carp;
use Encode;
use Unicode::Japanese;
use Unicode::Japanese qw(unijp);
use CGI;
use HTML::Entities;
use URI::Escape;
use Getopt::Long;

binmode STDOUT, ':utf8';

my $time = time;
my $steps = 18;
my $mapping = 'log';
my $sort = 'alpha';
my $limit = 75;
my $scale = 0;
my $prefix = '';

sub print_usage_and_exit {

	print <<USAGE;
usage: gscloud.pl [OPTIONS] ACCESS_LOG

options:
	steps   - number of cloud sizes, default $steps
	mapping - log or lin, default $mapping
	sort    - alpha or num, default $sort
	limit   - maximum number of phrases, default $limit
	scale   - scale when phrases less then steps, default $scale
	prefix  - prefix for paths (creates links), default none
USAGE

	exit;
}

GetOptions(

	"steps=i"   => \$steps,
	"mapping=s" => \$mapping,
	"sort=s"    => \$sort,
	"limit=i"   => \$limit,
	"scale=i"   => \$scale,
	"prefix=s"  => \$prefix
);

my $filename = shift;
defined $filename or $filename = '-';

open my $fh, $filename or
	die "Can't open '$filename' for reading: $!";

my %stats;
while ( my $line = <$fh> ) {

	$line =~ m!
		\[\d{2}/\w{3}/\d{4}(?::\d\d){3}.+?\]
		\s"GET\s(\S+)\sHTTP/\d.\d"
		\s(\S+)
		\s\S+
		\s"([^"]*)
	!xi or next;

	my ( $path, $status, $referer ) = ( $1, $2, $3 );

	# %XX がなぜか \xXX になっている場合がある??
	$referer =~ s/\\x(\w\w)/%$1/g;

	$referer =~ m!
		http://w{1,3}\.google\.(?:[a-z]{2}|com?\.[a-z]{2}|com)\.?/
		[^?]*\?([^"]*)
	!xi or next;

	my $query = $1;

	my $cgi = new CGI($query);
	my $q = $cgi->param('q');
	my $ie = $cgi->param('ie');

	next unless $q;

	if (defined $ie) { # ie パラメータがある場合
		if ($ie =~ /^utf-8|utf8$/i) { # UTF-8
			$q = unijp($q, 'utf8')->utf8;
		}
		elsif ($ie =~ /^(shift_jis|shift-jis|sjis|x-sjis)$/i) { # Shift_JIS
			$q = unijp($q, 'sjis')->utf8;
		}
		elsif ($ie =~ /^(euc-jp|euc_jp)$/i) { # EUC-JP
			$q = unijp($q, 'euc')->utf8;
		}
		elsif ($ie =~ /^iso-2022-jp$/i) { # ISO-2022-JP
			$q = unijp($q, 'jis')->utf8;
		}
		else { # どれでも無い場合は Shift_JIS と仮定
			$q = unijp($q, 'sjis')->utf8;
		}
	}
	{
		# 上記の変換で正常な UTF-8 バイト列にならない場合は Shift_JIS と仮定して再度変換
		use warnings FATAL => 'all';
		eval {
			my $dummy = lc(unijp($q)->h2zKana->z2hSym->z2hNum->z2hAlpha->getu);
		};
		if ($@) {
			$q = unijp($q, 'sjis')->utf8;
		}
	}
	# 日本語文字列正規化
	$q = unijp($q)->h2zKana->z2hSym->z2hNum->z2hAlpha->getu;
	# 大文字小文字正規化
	$q = lc($q);

	# なぜか 0x00 が入っていることが…
	$q =~ tr/\x{0}/ /;

	# or を取り除く
	$q =~ s" +or +" "g;
	# site:hogehoge 等を取り除く
	$q =~ s"(^| )\w+:[^ ]*""g;

	$q = join ' ' => split ' ', $q;
	$q = '(no query)' if $q eq '';

	$stats{ "$path:$status" }{ sum }++;
	$stats{ "$path:$status" }{ queries }{ $q }++;
	$stats{ "$path:$status" }{ uri }{ $q } = $referer;
}

close $fh or die "Can't close '$filename' after reading: $!";

print_html_start();

my @ps = sort { $stats{ $b }{ sum } <=> $stats{ $a }{ sum } } keys %stats;
for my $ps ( @ps ) {

	my ( $path, $status ) = $ps =~ /(.*):(\d+)/;
	my $sum = $stats{ $ps }{ sum };

	my $section = encode_entities($path);
	$prefix and $section = qq(<a href=") . encode_entities("$prefix$path"). qq(">$section</a>);

	print "<h2>$section",
		qq( <span class="small">total: $sum, status: $status</span>),
		"</h2>\n";
	print_cloud_as_html_list(

		frequencies => $stats{ $ps }{ queries },
		uri => $stats{ $ps }{ uri },
		steps => $steps,
		mapping => $mapping,
		sort => $sort,
		limit => $limit,
		scale => $scale,
	);
}

print_html_end( time - $time );
exit;


sub print_html_start {

	print <<"START";
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
 "http://www.w3.org/TR/html4/strict.dtd">
<html lang="ja">
<head>
	<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
	<title>Google Search Cloud (beta)</title>
	<link rel="stylesheet" type="text/css" href="gscloud-jp.css">
</head>
<body>
<h1>Google <span class="blue">Search Cloud</span>
<span class="beta">beta</span></h1>
START
}


sub print_html_end {

	my $delta = shift;
	print <<FOOTER;
<div class="footer">
	<a href="http://johnbokma.com/perl/google-search-cloud.html">Google
	Search Cloud</a>, written by John Bokma, took $delta seconds to
	generate this page.
</div>
</body>
</html>
FOOTER
}


sub print_cloud_as_html_list {

	my %params = @_;

	my $frequencies = $params{ frequencies }
		or croak "Parameter 'frequencies' not given";

	my $uri = $params{ uri }
		or croak "Parameter 'uri' not given";

	my $steps = $params{ steps }
		or croak "Parameter 'steps' not given";

	my $mapping = $params{ mapping } || 'log';
	$mapping eq 'log' or $mapping eq 'lin'
		or croak "Parameter 'mapping' has an unsupported value ($mapping)";

	my $sort_method = $params{ sort } || 'alpha';
	$sort_method eq 'alpha' or $sort_method eq 'num'
		or croak "Parameter 'sort' has an unsupported value ($sort_method)";

	my @keys = sort
		{ $frequencies->{ $b } <=> $frequencies->{ $a } } keys %$frequencies;

	# if there is a limit, take the top limit frequencies
	$params{ limit } and @keys = splice @keys, 0, $params{ limit };
	@keys or return;    # nothing to do

	$steps = @keys if $params{ scale } and $steps > @keys;
	my $max_steps = $steps - 1;

	my ( $max, $min ) = @$frequencies{ $keys[ 0 ], $keys[ -1 ] };

	print qq(<ul class="cloud">\n);

	my $step = $min == $max
		? sub { 1 }
		: $mapping eq 'log'
			? sub {

				1 + int( $max_steps * (
					( log( $frequencies->{ $_[ 0 ] } ) - log( $min )) /
					( log( $max ) - log( $min ) ) )
				)
			}
			: sub {

				1 + int( $max_steps *
					( $frequencies->{ $_[ 0 ] } - $min ) /
					( $max - $min )
				)

			};

	$sort_method eq 'alpha' and @keys = sort { lc $a cmp lc $b } @keys;

	print '  <li class="size' . $step->( $_ ) . '"><a href="',
		encode_entities($uri->{ $_ }), '">',
		encode_entities($_, '<>&"'), "</a></li>\n" for @keys;

	print "</ul>\n";
}
