#!/usr/bin/perl -w
#
# Perl script used for the preprocessing of training data by means of
# selective subsampling. 
# 
# Miloslav Nepil, FI MU Brno, CZ, <nepil@fi.muni.cz>
# Miroslav Dolecek, FI MU Brno, CZ, <dolecek@fi.muni.cz>
# Miroslav Kripac, FI MU Brno, CZ,  <kripac@fi.muni.cz>
# 
# Release 2
# 	* Extended classifiers selection
# 	* Examples choosing based on entropy

use strict;

# Script configuration (defaults; overridable from the command line below)
my $Version='2.0.2 6/22/2002';
my $classifiers_count_default = 2;
my $initial_fraction_default = 0.005;
my $terminal_fraction_default = 0.2;
my $example_entropy_default  = 0.5;
my $classifiers_choose = 4; # total number of candidate classifiers to choose from (2 x the committee size)
my $verbose = 0; # emit debugging/progress output
my $classifier_file_name = 'initial_classifier';
my $selected_classifier = 'classifier'; # the ones selected on the basis of their coefficients
my $LinDiscr = 'LinDiscr';
my $LinDiscrU = 'LinDiscrU';

$|=1; # disable output buffering on STDOUT so progress messages appear immediately

# Check the argument count; fewer than two positional arguments prints usage and exits
if (@ARGV < 2) {
	die <<EOF;

Perl script used for the preprocessing of training data by means of
selective subsampling.

Version $Version

Usage: $0 <InputStem> <OutputStem> [-n number_classifiers] [-i initial_fraction] [-t terminal_fraction] [-e example_entropy] [-v]

  -n 	number of initial classifiers used for committee voting (default $classifiers_count_default) 
  -i	fraction of data returned for training initial classifiers (default $initial_fraction_default)
  -t	fraction of data returned for training terminal classifier (default $terminal_fraction_default)
  -e	minimal entropy of selected example (default $example_entropy_default)
  -v    verbose 

EOF
}

# Read the optional command-line switches; each overrides its default above.
my $classifiers_count = $classifiers_count_default;
my $initial_fraction = $initial_fraction_default;
my $terminal_fraction = $terminal_fraction_default;
my $example_entropy = $example_entropy_default;

# Dispatch table: each recognized switch updates its target setting from the
# token following it on the command line (-v ignores its argument).
my %apply_switch = (
	'-n' => sub {
		$classifiers_count = $_[0];
		$classifiers_choose = 2 * $classifiers_count;
	},
	'-i' => sub { $initial_fraction  = $_[0]; },
	'-t' => sub { $terminal_fraction = $_[0]; },
	'-e' => sub { $example_entropy   = $_[0]; },
	'-v' => sub { $verbose = 1; },
);

# Every position after the two stems is scanned; tokens that are not a known
# switch (including switch values) are simply skipped, as in the original.
for my $pos (2 .. $#ARGV) {
	my $handler = $apply_switch{ $ARGV[$pos] } or next;
	$handler->( $ARGV[$pos + 1] );
}

# Report the effective parameter values
print "\nNumber of classifiers:\t$classifiers_count\n";
print "Initial fraction:\t$initial_fraction\n";
print "Terminal fraction:\t$terminal_fraction\n";
print "Example entropy:\t$example_entropy\n\n";

# Derive the data file names from the input/output stems
my $input_stem = $ARGV[0];
my $output_stem = $ARGV[1];

my $input_data = $input_stem . ".data";
my $output_data = $output_stem . ".data";

# The .test and .names companion files are copied through unchanged.
# Errors are suppressed (2>/dev/null, unchecked system) -- presumably because
# these files may legitimately be absent; confirm before tightening.
my $input_test = $input_stem . ".test";
my $output_test = $output_stem . ".test";
system("cp -f $input_test $output_test 2>/dev/null");

my $input_names = $input_stem . ".names";
my $output_names = $output_stem . ".names";
system("cp -f $input_names $output_names 2>/dev/null");


############################################################################################
# Part 1 - selection of suitable classifiers (the ones whose coefficients differ the most)
############################################################################################

# First determine the size of the training data and of its initial fraction.
# Fixed to a lexical filehandle with 3-arg open (the original used an
# injection-prone bareword 2-arg open).
print "Reading data size ... " if $verbose;
open my $size_fh, '<', $input_data
	or die "\nselsub error: Unable to open training data file $input_data.\n Check <InputStem>.data\n";
my $training_data_size = 0;
$training_data_size++ while <$size_fh>;
close $size_fh;
my $initial_data_size = int($training_data_size * $initial_fraction);
print " Ok\n" if $verbose;

print "Training data size: $training_data_size\n";
print "Initial data size: $initial_data_size\n";

# For every candidate classifier draw $initial_data_size random row indexes
# into the training data.
my @ClasIndex;
print "Generating random numbers ... " if $verbose;
foreach my $candidate (0 .. $classifiers_choose-1) {
	foreach my $slot (0 .. $initial_data_size-1) {
		$ClasIndex[$candidate][$slot] = int rand $training_data_size;
	}
}
print " Ok\n" if $verbose;

# Sort each candidate's index list ascending for the sequential pass below
foreach my $candidate (0 .. $classifiers_choose-1) {
	my @ordered = sort { $a <=> $b } @{ $ClasIndex[$candidate] };
	$ClasIndex[$candidate] = \@ordered;
}

# Open the training data file
open INPUTDATA, $input_data
	or die "\nselsub error: Unable to open training data file $input_data.\n Check <InputStem>.data\n";


# Now walk the training data once and decide which classifiers each row
# belongs to. If the generator produced two identical random indexes for one
# classifier, the duplicate is bumped to the next following row instead.
my $i=0;
my $line;
my @ClasData;
print "Creating initial classifiers data sets ..." if $verbose;
while (defined($line = <INPUTDATA>)) {
	for my $clas (0	.. $classifiers_choose-1) {
		# The index lists are sorted ascending, so only the head can match
		if (defined($ClasIndex[$clas][0]) and $i == $ClasIndex[$clas][0]) {
			push @{$ClasData[$clas]}, $line;			
			
			shift @{$ClasIndex[$clas]};
			# Re-spread duplicates so the next pass (head index 0) sees a
			# distinct value: duplicates of $i become $i+1, $i+2, ...
			my $j = 0;
			while (defined($ClasIndex[$clas][$j]) and  $i == $ClasIndex[$clas][$j]) {
				$ClasIndex[$clas][$j] = $i + $j+1;
				$j++;	
			}
			# ...and any later indexes the bumping overtook are pulled up too
			while (defined($ClasIndex[$clas][$j]) and  $i+$j > $ClasIndex[$clas][$j]) {
				$ClasIndex[$clas][$j] = $i + $j;
				$j++;	
			}
		}
	}
	$i++;
}
close INPUTDATA;
print " Ok\n" if $verbose;

# Write every candidate classifier's sample into its own .data file.
print "Writing initial classifiers data sets ..." if $verbose;
for my $clas (0 .. $classifiers_choose-1) {
	# Lexical filehandle + 3-arg open (was a bareword 2-arg open). The '>>'
	# append mode is kept from the original; it relies on the cleanup at the
	# end of the script having removed leftovers from a previous run.
	open my $clas_fh, '>>', "$classifier_file_name\_$clas.data"
		or die "\nselsub error: Unable to open temporary file.\n";
	for (@{$ClasData[$clas]}) {
		print {$clas_fh} $_;
	}
	# Check close on a write handle: buffered write errors surface here
	close $clas_fh
		or die "\nselsub error: Unable to open temporary file.\n";
	# LinDiscr expects an attribute definition (.names) under the same stem
	system ("ln -s $input_names $classifier_file_name\_$clas.names");
}
print " Ok\n" if $verbose;

# Run LinDiscr over every candidate sample and record its first reported
# coefficient (magnitude only; the sign is stripped by the capture).
# NOTE(review): %ClasCoef is keyed by coefficient value, so two classifiers
# producing an identical coefficient overwrite each other -- confirm that
# this collision is acceptable.
my $coef;
my %ClasCoef;
for my $clas (0 .. $classifiers_choose-1) {
	# List-form pipe open: the file stem no longer passes through the shell
	open my $lindiscr_fh, '-|', $LinDiscr, '-f', "$classifier_file_name\_$clas"
		or die "\nselsub error: Unable to open LinDiscr. Check configuration.\n";
	while (defined(my $line = <$lindiscr_fh>)) {
		next unless $line =~ /Coefficients:/;
		# The coefficient value is on the following line, e.g. "+1.234-0.5..."
		$line = <$lindiscr_fh>;
		# Two fixes: the original class [\+,\-] also matched a literal comma,
		# and $1 was read without testing the match, silently reusing a stale
		# capture when the line did not have the expected shape.
		if (defined $line and $line =~ /^[+-](\d+\.\d+)[+-]/) {
			$coef = $1;
			$ClasCoef{$coef} = $clas;
		}
	}
	close $lindiscr_fh;
}



# Koeficienty utridime a zjisitime indexy sudych klasifikatoru (tim vybereme ty co se budou nejvice lisit)
$i = 1;
print "Running classifiers ...\n" if $verbose;
for $coef (sort {$a <=> $b} keys %ClasCoef) {
	unless ($i % 2) {
		my $j = int($i/2)-1;
		print "   Classifier number $j ..." if $verbose;
		system ("cp -f $classifier_file_name\_$ClasCoef{$coef}.data $selected_classifier\_$j.data");
		system ("cp -f $classifier_file_name\_$ClasCoef{$coef}.coef $selected_classifier\_$j.coef");
		system ("ln -s $input_names $selected_classifier\_$j.names");
		system ("ln -s $input_data $selected_classifier\_$j.test");
		system ("$LinDiscrU -f $selected_classifier\_$j >/dev/null");
		print " Ok\n" if $verbose;
	}
	$i++;
}

# Nyni mame v souborech $selected_classifier* hodnoty rozhodnuti jednotlivych klasifikatoru na
# zaklade kterych urcime priklady ktere se vyberou do vysledne mnoziny


############################################################################################
# Part 2 - selection of data for the resulting set based on the classifiers' decisions
############################################################################################

# First read the correct answers -- the last comma-separated field of each
# row -- from the original data.
my @Correct;
print "Reading correct answers ... " if $verbose;
# Lexical filehandle + 3-arg open (was a bareword 2-arg open)
open my $answers_fh, '<', $input_data
	or die "\nselsub error: Unable to open INPUTDATA\n"; 
$i = 0;
while (defined($line = <$answers_fh>)) {
	# Capture only on a successful match; the original read $1
	# unconditionally, silently reusing a stale capture for rows that did
	# not match the pattern.
	$Correct[$i] = $line =~ /\,\s*(\S+)$/ ? $1 : undef;
	$i++;
}
close $answers_fh;
print " Ok\n" if $verbose;

# For every example count how many committee members decided it correctly
# (@Positive) and incorrectly (@Negative).
my @Positive;
my @Negative;
my $prediction_size;
print "Reading classifiers predictions ...\n" if $verbose;
for my $clas (0 .. $classifiers_count-1) {
	print "   Classifier number $clas ..." if $verbose;
	# Lexical filehandle + 3-arg open (was a bareword 2-arg open)
	open my $pred_fh, '<', "$selected_classifier\_$clas.pred"
		or die "\nselsub error: Unable to open prediction file.\n";
	my $i = 0;	
	while (defined($line = <$pred_fh>)) {
		# chomp, not chop: chop unconditionally removed the last character,
		# destroying a data character on a final line without a newline
		chomp $line;
		if ($line eq $Correct[$i]) {
			$Positive[$i]++;
		} else {
			$Negative[$i]++;
		}
		$i++;
	}
	close $pred_fh;
	# NOTE(review): $i already equals the number of prediction lines here, so
	# $i-1 excludes the last example from the entropy pass below. Kept as-is
	# to preserve the original sizing -- confirm whether plain $i was meant.
	$prediction_size = $i-1;
	print " Ok\n" if $verbose;
}

# ...and from the tallies compute each example's committee-vote entropy
# (natural log, so the maximum is ln 2). An example all members agreed on
# has one of the tallies undefined and gets entropy 0.
### Perhaps a better formulation of the entropy would give better results?
my @Entropy;
my @EntropyEx;
my $entropy_count = 0; # number of examples whose entropy meets the threshold
print "Counting example enropy ..." if $verbose;
for my $idx (0 .. $prediction_size-1) {
	my $pos = $Positive[$idx];
	my $neg = $Negative[$idx];

	my $h = 0;
	if (defined $pos and defined $neg) {
		my $total = $pos + $neg;
		my $fp = $pos / $total;
		my $fn = $neg / $total;
		$h = -($fp * log($fp)) - ($fn * log($fn));
	}
	$Entropy[$idx] = $h;

	# Remember every example whose entropy reaches the -e threshold
	if ($h >= $example_entropy) {
		push @EntropyEx, $idx;
		$entropy_count++;
	}
}
print " Ok\n" if $verbose;

print "Total count of high entropy data: $entropy_count (", 
	int($entropy_count/$prediction_size*10000)/100, "%)\n";

my $terminal_data_size = int($prediction_size * $terminal_fraction);

# When the high-entropy examples alone are fewer than the resulting set,
# pad them with randomly chosen remaining examples.
$i = 0;
my $j = 0;
my $k = 0;
my @AddEx;
# Collect the first $terminal_data_size example indexes that are NOT already
# in @EntropyEx. Both sequences are ascending, so one merge-like scan works.
# NOTE(review): when the data runs short this can emit indexes beyond the
# last example; they are silently dropped when the file is written -- confirm.
while ($k < $terminal_data_size) {
	if (defined $EntropyEx[$j] and $EntropyEx[$j] == $i) {
		$j++;
		$i++;
		next;
	}  
	push @AddEx, $i;	
	$k++;
	$i++;
}
# Unbiased Fisher-Yates shuffle. The original drew swap targets with
# rand($#AddEx), which could never select the last index and biased the
# permutation.
for my $pos (reverse 1 .. $#AddEx) {
	my $swap = int rand($pos + 1);
	@AddEx[$pos, $swap] = @AddEx[$swap, $pos];
}
# Append exactly the number of padding examples still missing. The original
# bound (.. $terminal_data_size - $entropy_count) pushed one index too many,
# which was only trimmed again by the later splice.
for my $idx (0 .. $terminal_data_size - $entropy_count - 1) {
	push @EntropyEx, $AddEx[$idx] if defined $AddEx[$idx];
}


# Vyslednou mnozinu nejprve nahodne prehazime 
print "Random selection of terminal set ..." if $verbose;
for my $i ( 0 .. $#EntropyEx) {
	my $rand = int rand ($#EntropyEx);
        my $p = $EntropyEx[$i];
        $EntropyEx[$i] = $EntropyEx[$rand]; 
        $EntropyEx[$rand] = $p; 
}
print " Ok\n" if $verbose;

# Pote vybereme prvni priklady az do velikosti terminalni mnoziny
splice(@EntropyEx, $terminal_data_size, $#EntropyEx);

# A vysledne pole setridime
@EntropyEx = sort {$a <=> $b} @EntropyEx;

# Copy the rows whose indexes appear in @EntropyEx into the resulting file.
# Lexical filehandles + 3-arg open (were bareword 2-arg opens).
print "Writing terminal set ..." if $verbose;
open my $term_fh, '>', $output_data
	or die "selsub error: Unable to write teminal set\n";
open my $source_fh, '<', $input_data
	or die "selsub error: Unable to read input data\n";
$i=0;
$j=0;
$line='';

# @EntropyEx is sorted ascending, so a single sequential pass over the input
# suffices: $j advances whenever the current row index is a selected one.
while (defined($line = <$source_fh>)) {
	if (defined $EntropyEx[$j] and $i == $EntropyEx[$j]) {
		print {$term_fh} $line;
		$j++;
	}
	$i++;
}

# Check close on the write handle: buffered write errors surface here
close $term_fh
	or die "selsub error: Unable to write teminal set\n";
close $source_fh;
print " Ok\n" if $verbose;

print "Terminal set size: $terminal_data_size (", 
	int($terminal_data_size/$prediction_size*10000)/100, "%)\n\n";

# Clean up all the intermediate classifier files (samples, symlinks,
# predictions) created above
system ("rm -f $classifier_file_name*");
system ("rm -f $selected_classifier*");
