#!/usr/bin/perl
# spam analysis core
# Brian Ward http://www.o--o.net/

# CONFIG (optional): replace with your default relay.
# This is a regular-expression fragment (it is matched as /by $myrelay/), so
# it is single-quoted: inside double quotes Perl drops the backslash of "\."
# and every dot would then match any character.
$myrelay = 'laime\.cs\.uchicago\.edu';

# CONFIG (optional): add your other mail relays, with comments.
# Each key is a lower-case regular expression that is tried against every
# lower-cased message line; the value is the comment shown in the server
# table.  "default" is the bucket for messages matching none of the patterns.
$scomment{'tu-graz\.ac\.at'} = "Kernel-HOWTO (old)";
$scomment{'default'} = "Kernel-HOWTO (recent)";
$scomment{'math\.psu\.edu'} = "old, not widely published";
$scomment{'clueful\.net'} = "dictionary spammer";

# "default" is only a fallback label: using it as a pattern would move a
# message back to the default bucket whenever a line contains that word.
# Sorting makes the winner deterministic when one line matches two patterns.
@servers = sort(grep { $_ ne "default" } keys(%scomment));
@autodetect = ("x-spam-warning");

# Lower bounds (in bytes) of the message-size histogram buckets; the last
# bucket is open-ended.
@size_distrib = (0, 1000, 2000, 4000, 6000, 8000, 10000, 12000, 14000, 16000, 18000, 20000, 22000, 24000, 26000, 28000, 30000, 32500, 35000, 40000, 50000, 100000, 1000000);

# Month abbreviation => month number (1-12).
%MoY = (
    Jan => 1, Feb => 2,  Mar => 3,  Apr => 4,
    May => 5, Jun => 6,  Jul => 7,  Aug => 8,
    Sep => 9, Oct => 10, Nov => 11, Dec => 12,
);

# Weekday abbreviation => weekday number (Mon = 1 ... Sun = 7).
%DoW = (
    Mon => 1, Tue => 2, Wed => 3, Thu => 4,
    Fri => 5, Sat => 6, Sun => 7,
);

# to skip today's stuff: today's date in the same "YYYY.MM.DD" form that is
# used for the per-day keys
@today = localtime(time);
$today[5] += 1900; $today[4]++;
$today_ds = sprintf "%d.%02d.%02d", $today[5], $today[4], $today[3];

# running extremes of the message sizes
$max_size = 0; $min_size = 99999999;

# Split a mail timestamp such as "Mon, 12 Jan 2004 10:11:12 -0600" into
# its parts.
#
# Argument: the timestamp string (leading whitespace is ignored).
# Returns the list
#   (year, month, day, day of week, hour, minute, second)
# where month is looked up in %MoY and day of week in %DoW; either is undef
# when the abbreviation is not in the table.
sub rctime {
    my ($ts) = @_;
    $ts = "" unless defined $ts;

    # split ' ' skips leading whitespace, so the fields never shift by one
    my @fields = split(' ', $ts);

    # the weekday normally carries a trailing comma; strip only that
    my $dow = defined $fields[0] ? $fields[0] : "";
    $dow =~ s/,$//;

    my $month = defined $fields[2] ? $fields[2] : "";
    my @hms = split(/:/, defined $fields[4] ? $fields[4] : "");

    # year, month, day, day of week, hour, minute, second
    return ($fields[3], $MoY{$month}, $fields[1], $DoW{$dow}, @hms);
}

# Account for the size of the message that has just been completed.
#
# Takes no arguments; everything is passed through globals:
#   reads   $this_message_size, @size_distrib
#   updates @sizes, $max_size, $min_size, @size_dist_count (and $i)
sub record_msgsize {
    push @sizes, $this_message_size;

    # keep the running extremes up to date
    $max_size = $this_message_size if $this_message_size > $max_size;
    $min_size = $this_message_size if $this_message_size < $min_size;

    # Walk up the bucket bounds until the next bound is no longer exceeded;
    # the final bucket is open-ended, so stop there unconditionally.
    for ($i = 0; $this_message_size > $size_distrib[$i + 1]; $i++) {
        last if $i == $#size_distrib;
    }
    $size_dist_count[$i]++;
}

# Scan every mailbox named on the command line and accumulate the global
# statistics (%byday, @bymonth, @bydayw, @byhour, @weekday_count,
# %server_dist, $total_autodetects and the size data kept by record_msgsize).
#
# Arguments:
#   -font NAME   font option handed to the graph program later on
#   FILE         mailbox file; a name ending in ".gz" is read via "gzip -dc"
#
# Messages are delimited by lines starting with "From ".  A message is
# counted only once a line matching /by $myrelay/ has been seen and the
# timestamp on the line following it parses and is not today's date.
while (defined($name = shift)) {
    if ($name eq "-font") {
        $fontname = shift;
        $font = "-font $fontname";
        next;
    }

    # Explicit open modes keep unusual file names away from the shell and
    # from being parsed as an open mode.  Unreadable files are reported and
    # skipped instead of being silently treated as empty.
    my $opened = ($name =~ /\.gz$/)
        ? open(FD, "-|", "gzip", "-dc", $name)
        : open(FD, "<", $name);
    unless ($opened) {
        warn "cannot read $name: $!\n";
        next;
    }

    # Every message of a mailbox whose file name contains "auto" is counted
    # as autodetected.  (Header-based detection via @autodetect is not used.)
    my $auto_box = ($name =~ /auto/) ? 1 : 0;

    # per-message state
    $invalid = 1;
    $this_message_size = 0;
    $recv_server = "default";
    $auto = $auto_box;

    # Read until the input is really exhausted, so that the final line of
    # the mailbox is processed as well (checking eof() right after a read
    # would throw that line away).
    while (defined($_ = <FD>)) {
        if (/^From /) {
            # a new message starts: close out the previous one
            finish_message();
            $invalid = 1;
            $this_message_size = 0;
            $recv_server = "default";
            $auto = $auto_box;
        }

        $this_message_size += length;

        chomp;

        if ($invalid && m/by $myrelay/) {
            # The timestamp is taken from the line after the "by <relay>"
            # line; that line belongs to the message, so its size counts too.
            $_ = <FD>;
            $_ = "" unless defined $_;	# relay line was the last line
            $this_message_size += length;
            s/\n$//g;
            s/^\s+//g;
            s/.*;.//g;		# drop everything up to the "; " before the date
            my @when = rctime($_);
            # year, month, day, day of week, hour, minute, second
            if ($when[1]) {
                my $datestr = sprintf "%d.%02d.%02d", $when[0], $when[1], $when[2];
                if ($datestr ne $today_ds) {	# we don't count today
                    $invalid = 0;		# mark as valid
                    # first message of this date: one more distinct day
                    # for its weekday (divisor of the weekday averages)
                    $weekday_count[$when[3]]++ unless $byday{$datestr};
                    $byday{$datestr}++;
                    $bymonth[$when[1]]++;
                    $bydayw[$when[3]]++;
                    $byhour[$when[4]]++;
                }
            } else {
                print "oopsies: $name\n";
            }
        }

        # Attribute the message to the last server pattern that matches any
        # of its lines.  Note that after a relay match $_ holds the stripped
        # timestamp, so that is what gets compared in that iteration.
        my $lc_line = lc($_);
        foreach my $pattern (@servers) {
            $recv_server = $pattern if $lc_line =~ m/$pattern/;
        }
    }

    # the last message of the mailbox has no "From " line after it
    finish_message();
    close(FD);
}

# Count the message that has just ended, provided it was accepted as valid.
# Works on the per-message globals maintained by the loop above
# ($invalid, $recv_server, $auto, $this_message_size).
sub finish_message {
    return if $invalid;
    $server_dist{$recv_server}++;
    record_msgsize();
    $total_autodetects++ if $auto;
}

# Aggregate the collected data:
#   $spam, $days            total messages and number of distinct days
#   $max/$maxday            busiest day and its count
#   $min/$minday            quietest day and its count
#   $spam_per_day           average messages per day
#   @weekday_avg[1..7]      average per weekday (Mon = 1 ... Sun = 7)
#   @sortsize, $median_size sorted message sizes and their (lower) median
#   @size_str               histogram labels matching @size_distrib
# Every average falls back to 0 when there is nothing to divide by, so the
# script survives empty input and weekdays without any mail.
$spam = 0; $max = 0; $min = 999999999; $days = 0;

# sorted keys: on a tie the earliest date is reported, independent of the
# hash order
foreach my $day (sort(keys(%byday))) {
    my $count = $byday{$day};
    $spam += $count;
    if ($max < $count) {
        $max = $count;
        $maxday = $day;
    }
    if ($min > $count) {
        $min = $count;
        $minday = $day;
    }
    $days++;
}

$spam_per_day = $days ? $spam / $days : 0;

# $weekday_count[$n] is the number of distinct dates seen for weekday $n
foreach my $wd (1 .. 7) {
    $weekday_avg[$wd] = $weekday_count[$wd]
        ? $bydayw[$wd] / $weekday_count[$wd]
        : 0;
}

@sortsize = sort { $a <=> $b } @sizes;
$median_size = @sortsize ? $sortsize[int($#sortsize / 2)] : 0;

# compute sizes: "lower-upper k" for every closed bucket and "lower k+" for
# the open-ended last one
@size_str = ();
foreach my $bin (0 .. $#size_distrib - 1) {
    push(@size_str,
         ($size_distrib[$bin] / 1000) . "-" . ($size_distrib[$bin + 1] / 1000) . "k");
}
push(@size_str, ($size_distrib[-1] / 1000) . "k+") if @size_distrib;

# print "<html><head></head><body bgcolor=\"#ffffff\" fgcolor=\"#ffffff\">\n";

#print "<style>\n";
#print "<!--\n";
#print ".\n";
#print "-->\n";
#print "<.style>\n";

# HTML report fragments.  Each fragment is assembled as a string and written
# in one go by write_report().

# generic statistics
my $auto_total = $total_autodetects || 0;
# share of autodetected messages; 0 when nothing was counted at all
my $auto_pct = sprintf "%.2f", ($spam ? 100 * $auto_total / $spam : 0);
my $spds = sprintf "%.2f", $spam_per_day;

write_report("generic.pht", <<"GEN");
<table border=1 style="text-align: center" cellpadding=3>
<tr><th>Total</th><th>Average</th><th>Maximum</th><th>Minimum</th><th>Autodetect</th></tr>
<tr><td>$spam</td><td>$spds</td><td>$max</td><td>$min</td><td>$auto_total</td></tr>
<tr><td colspan=2>($days days)</td><td>$maxday</td><td>$minday</td><td>($auto_pct%)</td></tr>
</table>
GEN

# days of the week: one cell per weekday, Mon (index 1) through Sun (index 7)
my $daily = <<"DOW";
<table border=1 cellpadding=3 style="text-align: center">
<tr><th>Mon</th><th>Tue</th><th>Wed</th><th>Thu</th><th>Fri</th><th>Sat</th><th>Sun</th></tr>
DOW
$daily .= "<tr>"
        . join("", map { sprintf("<td>%.2f</td>", $_) } @weekday_avg[1 .. 7])
        . "</tr>\n</table>\n";
write_report("daily.pht", $daily);

# size statistics
write_report("size.pht", <<"SIZE");
<table border=1 cellpadding=3 style="text-align: center">
<tr><th>Median</th><th>Maximum</th><th>Minimum</th></tr>
<tr><td>$median_size</td><td>$max_size</td><td>$min_size</td></tr>
</table>
SIZE

# server distribution: rows are identified by rank only, the server pattern
# itself is not printed
my $dist = "<table border=1 cellpadding=3 style=\"text-align: center\">"
         . "<tr><th>Server</th><th>Count</th><th>Pct</th><th>Comment</th></tr>\n";
my $rank = 0;
foreach my $srv (sort(keys(%server_dist))) {
    $rank++;
    my $pct = $spam ? 100 * $server_dist{$srv} / $spam : 0;
    my $comment = defined $scomment{$srv} ? $scomment{$srv} : "";
    # the comment is passed as data, never as part of the format string, so
    # a "%" inside it cannot corrupt the row
    $dist .= sprintf("<tr><td>#%s</td><td>%s</td><td>%.2f%%</td><td>%s</td></tr>\n",
                     $rank, $server_dist{$srv}, $pct, $comment);
}
$dist .= "</table>\n";
write_report("dist.pht", $dist);

# make sure ordinary prints go to standard output from here on
select STDOUT;

# Write $html to $file, replacing any previous contents.  A file that cannot
# be written is reported on STDERR and skipped.  Returns 1 on success and 0
# if the file could not be opened.
sub write_report {
    my ($file, $html) = @_;
    unless (open(F, ">", $file)) {
        warn "cannot write $file: $!\n";
        return 0;
    }
    print F $html;
    close(F) or warn "error closing $file: $!\n";
    return 1;
}

# generate graphs
#
# Both bar charts are made the same way: a title line followed by one
# "label value" line per bar is piped into the graph command, whose output
# is redirected to a .ps file.  $font carries the optional "-font NAME"
# switch from the command line.

# message sizes, one bar per entry of @size_distrib
open(OUT, "| graph $font -bar > size.ps");
print OUT "Size Distribution\n";
for ($i = 0; $i <= $#size_distrib; $i++) {
    print OUT "$size_str[$i] $size_dist_count[$i]\n";
}
close(OUT);

# messages per hour of day, up to the last hour that has any entry
open(OUT, "| graph $font -bar > hourly.ps");
print OUT "Hourly Distribution\n";
for ($i = 0; $i <= $#byhour; $i++) {
    print OUT "$i:00-$i:59 $byhour[$i]\n";
}
close(OUT);

# print "\nDaily Count)\n";
# foreach $day (sort(keys(%byday))) {
#     print "$day $byday{$day}\n";
# }

# Plot a moving average of the daily message counts as a line graph.
#
#   $numdays   how many of the most recent days to cover, or the string "all"
#   $nday_avg  width of the averaging window in days; anything below 1 is
#              treated as 1 (a window of 0 cannot be averaged)
#   $fn        file that receives the output of the graph command
#   $label     title line of the graph
#
# Reads %byday (date string => message count) and $font.  The data is piped
# to "graph $font -line" as a title line followed by "date average" lines.
sub mklineg {
    my ($numdays, $nday_avg, $fn, $label) = @_;

    $nday_avg = int($nday_avg || 0);
    $nday_avg = 1 if $nday_avg < 1;

    my @days = sort(keys(%byday));
    if ($numdays ne "all") {
        # Keep the newest $numdays + $nday_avg days.  Priming the window
        # below uses up $nday_avg - 1 of them, so $numdays + 1 points are
        # plotted.  Only trim when there is a surplus: a negative splice
        # length would mean "keep that many at the end" and drop valid days.
        my $surplus = scalar(@days) - ($numdays + $nday_avg);
        splice(@days, 0, $surplus) if $surplus > 0;
    }

    unless (open(OUT, "| graph $font -line > $fn")) {
        warn "cannot start graph for $fn: $!\n";
        return;
    }
    print OUT "$label\n";

    # build up the initial window with the $nday_avg - 1 oldest days
    my @window = ();
    while (scalar(@window) < $nday_avg - 1) {
        my $day = shift(@days);
        push(@window, defined $day ? $byday{$day} : 0);
    }

    # slide the window forward one day at a time and print its average
    while (defined(my $cur_day = shift(@days))) {
        push(@window, $byday{$cur_day});
        my $sum = 0;
        foreach my $count (@window) {
            $sum += $count;
        }
        my $avg = $sum / $nday_avg;
        print OUT "$cur_day $avg\n";
        shift(@window);
    }
    close(OUT);
}

# Width of the moving-average window: about one day per 7.5 days of data,
# at most 20 and at least 1.  Without the lower bound fewer than 8 days of
# data give a window of 0, which mklineg cannot average.
$nday_avg = int(($days || 0) / 7.5);
$nday_avg = 20 if $nday_avg > 20;
$nday_avg = 1 if $nday_avg < 1;

# If you want other graphs, add them here

mklineg("all", $nday_avg, "dcount.ps", "Daily Count ($nday_avg-Day Average)");
mklineg(30, 1, "30day.ps", "Past 30 Days");

