#!/usr/bin/perl -w
#
# Perl script used for the preprocessing of training data by means of
# selective subsampling. 
# 
# Miloslav Nepil, FI MU Brno, CZ, <nepil@fi.muni.cz>
# Miroslav Dolecek, FI MU Brno, CZ, <dolecek@fi.muni.cz>
# Miroslav Kripac, FI MU Brno, CZ,  <kripac@fi.muni.cz>
# 
# Release 2
# 	* Extended classifiers selection
# 	* Examples choosing based on entropy

use strict;

# Script configuration (defaults; overridable from the command line below)
my $Version='2.0.2 6/22/2002';
my $classifiers_count_default = 2;
my $initial_fraction_default = 0.005;
my $terminal_fraction_default = 0.2;
my $example_entropy_default  = 0.5;
my $classifiers_choose = 4; # total number of candidate classifiers to choose from (2 x the committee size)
my $verbose = 0; # emit debugging/progress output
my $classifier_file_name = 'initial_classifier';
my $selected_classifier = 'classifier'; # the ones selected on the basis of their coefficients
my $LinDiscr = 'LinDiscr';
my $LinDiscrU = 'LinDiscrU';

$|=1; # disable output buffering on STDOUT so progress messages appear immediately

# Check the argument count; fewer than two positional arguments prints usage and exits
if (@ARGV < 2) {
	die <<EOF;

Perl script used for the preprocessing of training data by means of
selective subsampling.

Version $Version

Usage: $0 <InputStem> <OutputStem> [-n number_classifiers] [-i initial_fraction] [-t terminal_fraction] [-e example_entropy] [-v]

  -n 	number of initial classifiers used for committee voting (default $classifiers_count_default) 
  -i	fraction of data returned for training initial classifiers (default $initial_fraction_default)
  -t	fraction of data returned for training terminal classifier (default $terminal_fraction_default)
  -e	minimal entropy of selected example (default $example_entropy_default)
  -v    verbose 

EOF
}

# Read the optional command-line switches; each overrides its default above.
my $classifiers_count = $classifiers_count_default;
my $initial_fraction = $initial_fraction_default;
my $terminal_fraction = $terminal_fraction_default;
my $example_entropy = $example_entropy_default;

# Dispatch table: each recognized switch updates its target setting from the
# token following it on the command line (-v ignores its argument).
my %apply_switch = (
	'-n' => sub {
		$classifiers_count = $_[0];
		$classifiers_choose = 2 * $classifiers_count;
	},
	'-i' => sub { $initial_fraction  = $_[0]; },
	'-t' => sub { $terminal_fraction = $_[0]; },
	'-e' => sub { $example_entropy   = $_[0]; },
	'-v' => sub { $verbose = 1; },
);

# Every position after the two stems is scanned; tokens that are not a known
# switch (including switch values) are simply skipped, as in the original.
for my $pos (2 .. $#ARGV) {
	my $handler = $apply_switch{ $ARGV[$pos] } or next;
	$handler->( $ARGV[$pos + 1] );
}

# Report the effective parameter values
print "\nNumber of classifiers:\t$classifiers_count\n";
print "Initial fraction:\t$initial_fraction\n";
print "Terminal fraction:\t$terminal_fraction\n";
print "Example entropy:\t$example_entropy\n\n";

# Derive the data file names from the input/output stems
my $input_stem = $ARGV[0];
my $output_stem = $ARGV[1];

my $input_data = $input_stem . ".data";
my $output_data = $output_stem . ".data";

# The .test and .names companion files are copied through unchanged.
# Errors are suppressed (2>/dev/null, unchecked system) -- presumably because
# these files may legitimately be absent; confirm before tightening.
my $input_test = $input_stem . ".test";
my $output_test = $output_stem . ".test";
system("cp -f $input_test $output_test 2>/dev/null");

my $input_names = $input_stem . ".names";
my $output_names = $output_stem . ".names";
system("cp -f $input_names $output_names 2>/dev/null");


############################################################################################
# Part 1 - selection of suitable classifiers (the ones whose coefficients differ the most)
############################################################################################

# First determine the size of the training data and of its initial fraction.
# Fixed to a lexical filehandle with 3-arg open (the original used an
# injection-prone bareword 2-arg open).
print "Reading data size ... " if $verbose;
open my $size_fh, '<', $input_data
	or die "\nselsub error: Unable to open training data file $input_data.\n Check <InputStem>.data\n";
my $training_data_size = 0;
$training_data_size++ while <$size_fh>;
close $size_fh;
my $initial_data_size = int($training_data_size * $initial_fraction);
print " Ok\n" if $verbose;

print "Training data size: $training_data_size\n";
print "Initial data size: $initial_data_size\n";

# For every candidate classifier draw $initial_data_size random row indexes
# into the training data.
my @ClasIndex;
print "Generating random numbers ... " if $verbose;
foreach my $candidate (0 .. $classifiers_choose-1) {
	foreach my $slot (0 .. $initial_data_size-1) {
		$ClasIndex[$candidate][$slot] = int rand $training_data_size;
	}
}
print " Ok\n" if $verbose;

# Sort each candidate's index list ascending for the sequential pass below
foreach my $candidate (0 .. $classifiers_choose-1) {
	my @ordered = sort { $a <=> $b } @{ $ClasIndex[$candidate] };
	$ClasIndex[$candidate] = \@ordered;
}

# Open the training data file
open INPUTDATA, $input_data
	or die "\nselsub error: Unable to open training data file $input_data.\n Check <InputStem>.data\n";


# Now walk the training data once and decide which classifiers each row
# belongs to. If the generator produced two identical random indexes for one
# classifier, the duplicate is bumped to the next following row instead.
my $i=0;
my $line;
my @ClasData;
print "Creating initial classifiers data sets ..." if $verbose;
while (defined($line = <INPUTDATA>)) {
	for my $clas (0	.. $classifiers_choose-1) {
		# The index lists are sorted ascending, so only the head can match
		if (defined($ClasIndex[$clas][0]) and $i == $ClasIndex[$clas][0]) {
			push @{$ClasData[$clas]}, $line;			
			
			shift @{$ClasIndex[$clas]};
			# Re-spread duplicates so the next pass (head index 0) sees a
			# distinct value: duplicates of $i become $i+1, $i+2, ...
			my $j = 0;
			while (defined($ClasIndex[$clas][$j]) and  $i == $ClasIndex[$clas][$j]) {
				$ClasIndex[$clas][$j] = $i + $j+1;
				$j++;	
			}
			# ...and any later indexes the bumping overtook are pulled up too
			while (defined($ClasIndex[$clas][$j]) and  $i+$j > $ClasIndex[$clas][$j]) {
				$ClasIndex[$clas][$j] = $i + $j;
				$j++;	
			}
		}
	}
	$i++;
}
close INPUTDATA;
print " Ok\n" if $verbose;

# Write every candidate classifier's sample into its own .data file.
print "Writing initial classifiers data sets ..." if $verbose;
for my $clas (0 .. $classifiers_choose-1) {
	# Lexical filehandle + 3-arg open (was a bareword 2-arg open). The '>>'
	# append mode is kept from the original; it relies on the cleanup at the
	# end of the script having removed leftovers from a previous run.
	open my $clas_fh, '>>', "$classifier_file_name\_$clas.data"
		or die "\nselsub error: Unable to open temporary file.\n";
	for (@{$ClasData[$clas]}) {
		print {$clas_fh} $_;
	}
	# Check close on a write handle: buffered write errors surface here
	close $clas_fh
		or die "\nselsub error: Unable to open temporary file.\n";
	# LinDiscr expects an attribute definition (.names) under the same stem
	system ("ln -s $input_names $classifier_file_name\_$clas.names");
}
print " Ok\n" if $verbose;

# Run LinDiscr over every candidate sample and record its first reported
# coefficient (magnitude only; the sign is stripped by the capture).
# NOTE(review): %ClasCoef is keyed by coefficient value, so two classifiers
# producing an identical coefficient overwrite each other -- confirm that
# this collision is acceptable.
my $coef;
my %ClasCoef;
for my $clas (0 .. $classifiers_choose-1) {
	# List-form pipe open: the file stem no longer passes through the shell
	open my $lindiscr_fh, '-|', $LinDiscr, '-f', "$classifier_file_name\_$clas"
		or die "\nselsub error: Unable to open LinDiscr. Check configuration.\n";
	while (defined(my $line = <$lindiscr_fh>)) {
		next unless $line =~ /Coefficients:/;
		# The coefficient value is on the following line, e.g. "+1.234-0.5..."
		$line = <$lindiscr_fh>;
		# Two fixes: the original class [\+,\-] also matched a literal comma,
		# and $1 was read without testing the match, silently reusing a stale
		# capture when the line did not have the expected shape.
		if (defined $line and $line =~ /^[+-](\d+\.\d+)[+-]/) {
			$coef = $1;
			$ClasCoef{$coef} = $clas;
		}
	}
	close $lindiscr_fh;
}



# Koeficienty utridime a zjisitime indexy sudych klasifikatoru (tim vybereme ty co se budou nejvice lisit)
$i = 1;
print "Running classifiers ...\n" if $verbose;
for $coef (sort {$a <=> $b} keys %ClasCoef) {
	unless ($i % 2) {
		my $j = int($i/2)-1;
		print "   Classifier number $j ..." if $verbose;
		system ("cp -f $classifier_file_name\_$ClasCoef{$coef}.data $selected_classifier\_$j.data");
		system ("cp -f $classifier_file_name\_$ClasCoef{$coef}.coef $selected_classifier\_$j.coef");
		system ("ln -s $input_names $selected_classifier\_$j.names");
		system ("ln -s $input_data $selected_classifier\_$j.test");
		system ("$LinDiscrU -f $selected_classifier\_$j >/dev/null");
		print " Ok\n" if $verbose;
	}
	$i++;
}

# Nyni mame v souborech $selected_classifier* hodnoty rozhodnuti jednotlivych klasifikatoru na
# zaklade kterych urcime priklady ktere se vyberou do vysledne mnoziny


############################################################################################
# Part 2 - selection of data for the resulting set based on the classifiers' decisions
############################################################################################

# First read the correct answers -- the last comma-separated field of each
# row -- from the original data.
my @Correct;
print "Reading correct answers ... " if $verbose;
# Lexical filehandle + 3-arg open (was a bareword 2-arg open)
open my $answers_fh, '<', $input_data
	or die "\nselsub error: Unable to open INPUTDATA\n"; 
$i = 0;
while (defined($line = <$answers_fh>)) {
	# Capture only on a successful match; the original read $1
	# unconditionally, silently reusing a stale capture for rows that did
	# not match the pattern.
	$Correct[$i] = $line =~ /\,\s*(\S+)$/ ? $1 : undef;
	$i++;
}
close $answers_fh;
print " Ok\n" if $verbose;

# For every example count how many committee members decided it correctly
# (@Positive) and incorrectly (@Negative).
my @Positive;
my @Negative;
my $prediction_size;
print "Reading classifiers predictions ...\n" if $verbose;
for my $clas (0 .. $classifiers_count-1) {
	print "   Classifier number $clas ..." if $verbose;
	# Lexical filehandle + 3-arg open (was a bareword 2-arg open)
	open my $pred_fh, '<', "$selected_classifier\_$clas.pred"
		or die "\nselsub error: Unable to open prediction file.\n";
	my $i = 0;	
	while (defined($line = <$pred_fh>)) {
		# chomp, not chop: chop unconditionally removed the last character,
		# destroying a data character on a final line without a newline
		chomp $line;
		if ($line eq $Correct[$i]) {
			$Positive[$i]++;
		} else {
			$Negative[$i]++;
		}
		$i++;
	}
	close $pred_fh;
	# NOTE(review): $i already equals the number of prediction lines here, so
	# $i-1 excludes the last example from the entropy pass below. Kept as-is
	# to preserve the original sizing -- confirm whether plain $i was meant.
	$prediction_size = $i-1;
	print " Ok\n" if $verbose;
}

# ...and from the tallies compute each example's committee-vote entropy
# (natural log, so the maximum is ln 2). An example all members agreed on
# has one of the tallies undefined and gets entropy 0.
### Perhaps a better formulation of the entropy would give better results?
my @Entropy;
my @EntropyEx;
my $entropy_count = 0; # number of examples whose entropy meets the threshold
print "Counting example enropy ..." if $verbose;
for my $idx (0 .. $prediction_size-1) {
	my $pos = $Positive[$idx];
	my $neg = $Negative[$idx];

	my $h = 0;
	if (defined $pos and defined $neg) {
		my $total = $pos + $neg;
		my $fp = $pos / $total;
		my $fn = $neg / $total;
		$h = -($fp * log($fp)) - ($fn * log($fn));
	}
	$Entropy[$idx] = $h;

	# Remember every example whose entropy reaches the -e threshold
	if ($h >= $example_entropy) {
		push @EntropyEx, $idx;
		$entropy_count++;
	}
}
print " Ok\n" if $verbose;

print "Total count of high entropy data: $entropy_count (", 
	int($entropy_count/$prediction_size*10000)/100, "%)\n";

my $terminal_data_size = int($prediction_size * $terminal_fraction);

# When the high-entropy examples alone are fewer than the resulting set,
# pad them with randomly chosen remaining examples.
$i = 0;
my $j = 0;
my $k = 0;
my @AddEx;
# Collect the first $terminal_data_size example indexes that are NOT already
# in @EntropyEx. Both sequences are ascending, so one merge-like scan works.
# NOTE(review): when the data runs short this can emit indexes beyond the
# last example; they are silently dropped when the file is written -- confirm.
while ($k < $terminal_data_size) {
	if (defined $EntropyEx[$j] and $EntropyEx[$j] == $i) {
		$j++;
		$i++;
		next;
	}  
	push @AddEx, $i;	
	$k++;
	$i++;
}
# Unbiased Fisher-Yates shuffle. The original drew swap targets with
# rand($#AddEx), which could never select the last index and biased the
# permutation.
for my $pos (reverse 1 .. $#AddEx) {
	my $swap = int rand($pos + 1);
	@AddEx[$pos, $swap] = @AddEx[$swap, $pos];
}
# Append exactly the number of padding examples still missing. The original
# bound (.. $terminal_data_size - $entropy_count) pushed one index too many,
# which was only trimmed again by the later splice.
for my $idx (0 .. $terminal_data_size - $entropy_count - 1) {
	push @EntropyEx, $AddEx[$idx] if defined $AddEx[$idx];
}


# Vyslednou mnozinu nejprve nahodne prehazime 
print "Random selection of terminal set ..." if $verbose;
for my $i ( 0 .. $#EntropyEx) {
	my $rand = int rand ($#EntropyEx);
        my $p = $EntropyEx[$i];
        $EntropyEx[$i] = $EntropyEx[$rand]; 
        $EntropyEx[$rand] = $p; 
}
print " Ok\n" if $verbose;

# Pote vybereme prvni priklady az do velikosti terminalni mnoziny
splice(@EntropyEx, $terminal_data_size, $#EntropyEx);

# A vysledne pole setridime
@EntropyEx = sort {$a <=> $b} @EntropyEx;

# Copy the rows whose indexes appear in @EntropyEx into the resulting file.
# Lexical filehandles + 3-arg open (were bareword 2-arg opens).
print "Writing terminal set ..." if $verbose;
open my $term_fh, '>', $output_data
	or die "selsub error: Unable to write teminal set\n";
open my $source_fh, '<', $input_data
	or die "selsub error: Unable to read input data\n";
$i=0;
$j=0;
$line='';

# @EntropyEx is sorted ascending, so a single sequential pass over the input
# suffices: $j advances whenever the current row index is a selected one.
while (defined($line = <$source_fh>)) {
	if (defined $EntropyEx[$j] and $i == $EntropyEx[$j]) {
		print {$term_fh} $line;
		$j++;
	}
	$i++;
}

# Check close on the write handle: buffered write errors surface here
close $term_fh
	or die "selsub error: Unable to write teminal set\n";
close $source_fh;
print " Ok\n" if $verbose;

print "Terminal set size: $terminal_data_size (", 
	int($terminal_data_size/$prediction_size*10000)/100, "%)\n\n";

# Clean up all the intermediate classifier files (samples, symlinks,
# predictions) created above
system ("rm -f $classifier_file_name*");
system ("rm -f $selected_classifier*");
