#!/usr/bin/perl -w

use strict;

# Configuration: default input log and the three list files.
my $soubor         = '/usr/local/etc/httpd/logs/access_log';
my $blacklisthosts = '/home/xkripac/WUM/blacklist.hosts';
my $blacklistsites = '/home/xkripac/WUM/blacklist.sites';
my $charsets       = '/home/xkripac/WUM/charsets';

# Remember the start time so the run time can be reported at the end.
my $time = time;

print STDERR "\nFiltr zpracovavajici logy weboveho serveru FI MU\n\n";

# Read a list file and return its non-empty lines without line terminators.
#   $path  - file to read
#   $error - message to die with when the file cannot be opened; the OS
#            error text is appended to it
# Blank lines are skipped because every entry is later used as a regular
# expression, and an empty pattern does not mean "match nothing".
sub _read_list {
	my ($path, $error) = @_;

	# Three-argument open: the path is never interpreted as a mode or pipe.
	open my $fh, '<', $path
		or die "$error ($!)\n";

	my @entries;
	while (defined(my $entry = <$fh>)) {
		# Strip only the line terminator (LF or CRLF); a last line without
		# a trailing newline keeps all of its characters.
		$entry =~ s/\r?\n\z//;
		next if $entry eq '';
		push @entries, $entry;
	}
	close $fh;

	return @entries;
}

# Load the blacklist of hosts.
my @blh = _read_list($blacklisthosts,
	"Chyba: Blacklist $blacklisthosts nelze otevrit pro cteni!");

# Scratch variable shared by the rest of the script.
my $line;
print STDERR "Nacteno ", scalar(@blh), " stroju pro ignorovani.\n";


# Load the blacklist of pages.
my @bls = _read_list($blacklistsites,
	"Chyba: Blacklist $blacklistsites nelze otevrit pro cteni!");
print STDERR "Nacteno ", scalar(@bls), " stranek pro ignorovani.\n";

# Load the character sets (two different character sets appear in the log
# as a single page).
my @blc = _read_list($charsets,
	"Chyba: Soubor $charsets znakovych sad nelze otevrit pro cteni!");
my $count = scalar @blc;
print STDERR "Nacteno $count kompatibilnich znakovych sad.\n";

	

# Select the input: the first command-line argument overrides the default
# log file, and '-' selects standard input.
# A separate flag records the stdin choice so that $soubor is only a display
# name and a real file can never be mistaken for the stdin marker.
my $read_stdin = 0;
if (@ARGV) {
	if ($ARGV[0] eq '-') {
		$read_stdin = 1;
		$soubor     = 'Standardni vstup';
	} else {
		$soubor = $ARGV[0];
	}
}

print STDERR "Zpracovavam: $soubor\n";

if ($read_stdin) {
	# Alias FILE to STDIN so the reading code can use one handle name.
	*FILE = *STDIN;
} else {
	# Three-argument open: a user-supplied name such as "cmd |" or ">file"
	# is treated as a plain path, never as a mode or a command.
	open FILE, '<', $soubor
		or die "Chyba: Soubor $soubor nelze otevrit pro cteni! ($!)\n";
}

# Counters reported once the whole input has been read.
my %blhoststat;		# host blacklist entry => number of lines dropped
my %blsitestat;		# page blacklist entry => number of lines dropped
my $userstat = 0;	# lines dropped because they target user pages

# Compile every list entry once instead of once per log line.  Entries are
# used as regular expressions, not as literal strings.  Empty entries are
# discarded: an empty match pattern makes Perl reuse the last successful
# pattern, and an empty charset would turn the substitution below into
# "remove everything from the first dot to the next whitespace".
my @host_rules = map { my $p = $_; [ $p, qr/$p/ ] } grep { length $_ } @blh;
my @site_rules = map { my $p = $_; [ $p, qr/$p/ ] } grep { length $_ } @bls;
my @charset_rules = map { my $cs = $_; qr/\.$cs\S*/ } grep { length $_ } @blc;

$count = 0;

print STDERR "\t . = 1000 zpracovanych radek\n";
print STDERR "\t radek obsahuje 50 tecek\n";

LINE:
while (defined ($line = <FILE>)) {
	# Progress indicator: one dot per 1000 lines, 50 dots per row.
	$count++;
	if ($count % 1000 == 0) {
		print STDERR ".";
		print STDERR "\n" if $count % 50000 == 0;
	}

	# Drop lines matching the host blacklist (typically search engines).
	# The statistics stay keyed by the original list entry.
	foreach my $rule (@host_rules) {
		my ($entry, $regex) = @$rule;
		if ($line =~ $regex) {
			$blhoststat{$entry}++;
			next LINE;
		}
	}

	# Drop requests for student pages: " /~", " /usr/" and "/%7E" or "/%7e".
	if ($line =~ m{ /~} or $line =~ m{ /usr/} or $line =~ m{/%7[Ee]}) {
		$userstat++;
		next LINE;
	}

	# Drop lines matching the page blacklist (robots.txt and the like).
	foreach my $rule (@site_rules) {
		my ($entry, $regex) = @$rule;
		if ($line =~ $regex) {
			$blsitestat{$entry}++;
			next LINE;
		}
	}

	# Normalise character sets: remove the first ".<charset>" suffix and
	# any non-whitespace characters that follow it, once per charset.
	foreach my $strip (@charset_rules) {
		$line =~ s/$strip//;
	}

	print $line;
}


print STDERR "\n\n";

# One line per host blacklist entry that dropped at least one log line.
print STDERR map {
	'Ignorovano ' . $blhoststat{$_} . ' pristupu ze stroje ' . $_ . ".\n"
} keys %blhoststat;

# One line per page blacklist entry that dropped at least one log line.
print STDERR map {
	'Ignorovano ' . $blsitestat{$_} . ' pristupu na stranku ' . $_ . ".\n"
} keys %blsitestat;

# Totals: dropped user-page requests and all input lines read.
print STDERR 'Ignorovano ' . $userstat . " pristupu k uzivatelskym strankam.\n";

print STDERR 'Celkem zpracovano ' . $count . " stranek.\n";

# Wall-clock run time in whole seconds since $time was recorded.
my $elapsed = time - $time;
print STDERR 'Cas behu skriptu: ' . $elapsed . "s.\n";

close FILE;

