#!/usr/bin/perl -w
# toplog
# From: Stephane Bortzmeyer <bortzmeyer@pasteur.fr>

use English;
use strict;

my ($logfile) = shift (@ARGV);

# %results maps each query string to the number of matches logged for it;
# %numbers counts, per word, the first-page requests that contained it.
# $word and $request serve as loop variables for the report section.
my (%results, %numbers, $word, $request);

# Characters accepted inside a search word; interpolated into the
# bracketed classes of the log-line pattern.
my ($characters) = '\wéèçàâôêöïÉÈ';

# Test definedness and length rather than truth, so that a log file
# literally named "0" is not mistaken for a missing argument.
if (! defined ($logfile) or ! length ($logfile)) {
    die "Usage: $0 logfile";
}

# Pattern for one htsearch log line, compiled once.  The captures used
# further down are 7 (the words as typed), 10 (the first number of the
# results pair) and 11 (the page number).
my $log_line = qr/^([a-z]{3}\ \d+\ \d+:\d+:\d+)\ # Date
    ([a-z0-9]+)\ # Machine name
    (htsearch\[\d+\]:)\  # Program name and PID
    ([a-z0-9\-\.]+)\  # Client name or address
    (\[[a-z\-]+\])\ # Configuration file
    \((and|or|boolean)\)\ # Operator
    \[([$characters\'\"\-\?\!\&,;\+\* ]+)\]\ # Words
    \[([$characters\'\"\-\?\!\&,;\+\*\(\) ]+)\]\ # Logical words
    \(((\d+)\/\d+)\)\ # Results
    \-\ # Separator
    (\d+)\ # Page number
    /xi;

# Three-argument open takes the file name literally: no whitespace
# stripping and no special meaning for a leading "&".  The former
# two-argument open read standard input when given "-"; that convention
# is kept, but explicitly.
my $log_fh;
if ($logfile eq '-') {
    $log_fh = \*STDIN;
}
else {
    open ($log_fh, '<', $logfile) or
        die "Cannot read $logfile: $OS_ERROR";
}

while (<$log_fh>) {
    chomp;
    if (! m/$log_line/) {
        warn "Cannot parse \"$_\"";
        next;
    }

    # Copy the captures right away: the stop-word match below would
    # otherwise overwrite them.
    my ($words, $matches, $page) = ($7, $10, $11);

    # Display results only for the first page
    next if ($page != 1);

    $results{$words} = $matches;
    foreach my $term (split ('\s|,', $words)) {
        # split yields empty strings between adjacent separators; compare
        # against '' so that the search word "0" is still counted.
        next if ($term eq '');
        # Skip operators and common French/English filler words.
        next if ($term =~ /^(and|et|or|ou|de|le)$/i);
        $numbers{$term}++;
    }
}
close ($log_fh);

# First report: every counted word with its request count, highest count
# first (by_numbers sorts ascending, hence the reverse).
print "\n-- NUMBER OF REQUESTS PER WORD --\n";
my @ranked_words = reverse (sort by_numbers keys %numbers);
foreach $word (@ranked_words) {
    print "$word: $numbers{$word}\n";
}

# Second report: every query string with its result count, highest count
# first (by_results sorts ascending, hence the reverse).
print "\n-- NUMBER OF RESULTS PER QUESTION --\n";
my @ranked_requests = reverse (sort by_results keys %results);
foreach $request (@ranked_requests) {
    print "$request: $results{$request}\n";
}

# Sort comparator for the keys of %results: ascending by result count.
# Equal counts fall back to reverse alphabetical order of the query, so
# that the "reverse sort" used by the report lists ties alphabetically.
# Without the tie-break, ties came out in whatever order keys returned
# them, which Perl does not guarantee to be the same from run to run.
sub by_results {
    return $results{$a} <=> $results{$b}
        || $b cmp $a;
}

# Sort comparator for the keys of %numbers: ascending by request count.
# Equal counts fall back to reverse alphabetical order of the word, so
# that the "reverse sort" used by the report lists ties alphabetically.
# Without the tie-break, ties came out in whatever order keys returned
# them, which Perl does not guarantee to be the same from run to run.
sub by_numbers {
    return $numbers{$a} <=> $numbers{$b}
        || $b cmp $a;
}

