#!/usr/bin/perl

# exit vals:
# 1: incorrect options
# 2: missing snowclone
# 3: not a snowclone
# 10: missing google key
# 11: rejected google key

use strict;
use warnings;

use vars qw($keep_punctuation $keep_caps $verbose $hs);

use English;

use Net::Google;
use Getopt::Long;
use Pod::Usage;
use HTML::Strip;
use HTML::Entities;

   $hs = new HTML::Strip;

my $google_key = GetGoogleKey();
my @langs = qw(en);
# limited to 10 according to http://www.google.com/apis/reference.html#2_7
my $results_per_query = 10;
my $results = 100;
   $verbose = 0;
   $keep_caps = 0;
   $keep_punctuation = 0;
my $safe = 0;
my $filter = 0;
my $proxy = $ENV{"HTTP_PROXY"};
my $help = 0;

GetOptions("help|h" => \$help,
	   "verbose|v+" => \$verbose,
	   "langs|l=s@" => \@langs,
	   "per|p=i" => \$results_per_query,
	   "punctuation|u" => \$keep_punctuation,
	   "caps|c" => \$keep_caps,
	   "key|k=s" => \$google_key,
	   "results|r=i" => \$results,
	   "safe|s" => \$safe,
	   "filter|f" => \$filter,
	   "proxy" => \$proxy,)
    or pod2usage(-verbose => 0, -message => $!, -exitval => 1);

pod2usage(0, -verbose => 1) if $help;

if (!length($google_key)) {
  pod2usage(-verbose => 0, -message => "No Google key given!", -exitval => 10);
}

# if they gave multiple -l args, they'll be in @langs. if they gave a single
# arg as "en ja", then it'll be in a single var - not good.
@langs = split(/ /, join(' ', @langs));

if ($#ARGV != "0") {
# missing snowclone
  pod2usage(-message => "No snowclone given", -verbose => 0, -exitval => 1);
}
my $clone = $ARGV[0];

if ($clone !~ /\*/) {
  pod2usage(-verbose => 0, -message => "That doesn't look like a snowclone to me - there's no * in it." -exitval => 3);
}

if ($results > 1000) {
  print "Warning: Google limits results to 1000 - you requested $results. Trying anyways.\n";
}

my $grepclone = $clone;

$grepclone = normalize($grepclone);
$grepclone =~ s/\*/\(\\w\+\)/g;

my $google = Net::Google->new(key => $google_key);
if (!defined $google) { die $! }
my $search = $google->search(lr => @langs);
if (!defined $search) { die $! }

$search->query(qq("$clone"));
$search->max_results($results_per_query);
$search->safe($safe);
$search->filter($filter);
$search->http_proxy($proxy);

my %clones;
my %stats;

for (my $start = 0; $start < $results; $start += $results_per_query) {
  verbose(1, "Searching: $start - " . ($start + $results_per_query) . "\n");
  $search->starts_at($start);
  # when would multiple responses be returned?
  my $response = @{$search->response()}[0]; # XXX hack, take the first
  $stats{total} = $response->estimatedTotalResultsCount();
  $stats{estimate} = not $response->estimateIsExact();
  my $results = $search->results();
  foreach my $result (@$results) {
    my $snippet = $result->snippet();
    $snippet = normalize($snippet);
    $snippet =~ m/$grepclone/ or verbose(1, "clone not found in: $snippet\n");
    $clones{$MATCH}++;
  }
  if ($response->endIndex() < ($start + $results_per_query)) {
    last;
  }
}

print "Total results";
if ($stats{estimate}) { print " (estimate)"; }
print ": " . $stats{total} . "\n";
foreach my $match (sort { $clones{$b} <=> $clones{$a} } keys %clones) {
  print "$match: $clones{$match}\n";
}

sub verbose {
  my ($level, $phrase) = @_;

  if ($verbose >= $level) {
    print STDERR $phrase;
  }
}

# normalize clone for grepping
# also normalize snippet text
sub normalize {
  my ($grepclone) = @_;
  
  verbose(2, "Normalizing: $grepclone\n");
  # no more HTML tags
  $grepclone = $hs->parse($grepclone);
  # no more HTML entities, either
  $grepclone = decode_entities($grepclone);
  
  if (!$keep_punctuation) {
    $grepclone =~ s/[^[:alnum:][:blank:]*]*//g;
  }
  
  if (!$keep_caps) {
    $grepclone = lc $grepclone;
  }

  $grepclone =~ y/ //s;
  
  verbose(2, "Normalized: $grepclone\n");
  return $grepclone;
}

sub GetGoogleKey {
  my $keyfile = $ENV{"HOME"} . "/.googlekey";
  open(KEYFILE, $keyfile) or return undef;
  my $googlekey = <KEYFILE>;
  chomp $googlekey;
  close(KEYFILE);
  return $googlekey;
}

__END__

=head1 NAME

snowclone_google.pl - gathers snowclone statistics from google

=head1 SYNOPSIS

snowclone_google.pl [options] "snowclone text"

 Options:
   -h, --help		      prints this help message
   -v, --verbose	      increases verbosity level
   -l, --langs	    "en"      set language list
   -p, --per	    100	      set results per query
   -r, --results    100	      set total number of results
   -k, --key	    ""	      google API key
   -c, --caps		      keep capitalization
   -u, --punctuation	      keep punctuation
   -s, --safe		      turn on SafeSearch
   -f, --filter		      turn on filtering of similar search results
       --proxy		      HTTP proxy

=head1 OPTIONS

=over 8

=item B<-l, --langs>

Set languages. Space seperated. Example: "en ja" to request English and
Japanese results.

=item B<-k, --key>

Google API key. You can request one from Google: http://www.google.com/apis/

=back

=head1 DESCRIPTION

B<This program> will search using Google for the specified snowclone string,
given using Google's globbing format. It will then extract the strings that
satisfy the snowclone from the snippets provided by Google and compile
statistics on them.

Capitalization and punctuation are, by default, discarded.

The Google API key will be read from the file ~/.googlekey. It is necessary to
supply this key either in this file or via the command-line options, as it is
required to access the Google API.

The HTTP_PROXY environment variable is honored, but note that the Google API
docs warn that many proxies have difficulty with the SOAP API which underlies
the Google API.

=head1 EXAMPLE

snowclone_google.pl -r 500 "In space, no one can hear you *"

=head1 NOTE

As of 2004-02-15, Google only allows the first 1000 results to be retrieved.
Also, the number of results per query is limited to 10.

=head1 TODO

Make it possible to have all of the URLs and full snippets listed, seperated by
which variant of the snowclone they are. This should probably be HTML format
for ease of use.

Actual error handling for rejected keys, overlimit, etc.

=head1 BUGS

Reports * as a hit sometimes.

Probably a bunch of others. Let me know if you find any.

=head1 COPYRIGHT

Copyright 2004 Michael Leuchtenburg.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 AUTHOR

Michael Leuchtenburg <michael@slashhome.org>

=cut
