#************************************************************

=head1 NAME

build_probabilities.pl - Turn two token files into a probability file

=head1 DESCRIPTION

Reads a file of tokens collected from SPAM messages and a file of tokens
collected from non-SPAM messages, builds the token probabilities from the
two, and writes the result to the probability file.

The bad token file, the good token file, and the probability file are all
required. If any of them is missing, or an option cannot be parsed, the
usage message is printed and the program exits with a non-zero status.

The following options are supported

=head2 Bad (SPAM) Token File (-b or --bad)

Specify the token file containing the bad tokens.
In other words, the tokens from the SPAM messages.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat

=head2 Good Token File (-g or --good)

Specify the token file containing the good tokens.
In other words, the tokens from the non-SPAM messages.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat

=head2 Help (-h, -? or --help)

Print usage instructions

example: perl -w build_probabilities.pl -h

=head2 Log File Name (-l or --log)

If a logfile is specified, then this is used as the logfile name.

=head2 Log Configuration Files (--log_cfg)

You can create a configuration file for your logger and then configure your
log object by simply telling it to read the specified configuration file.
To create an initial configuration file, write a perl script that creates
a logger, configures the logger, and then use the
write_to_file('log_cfg.dat') method.

This provides complete control over how the logger is configured.
You can set screen and file output levels, for example.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat --log_cfg ~andy/logs/default_log.dat

=head2 Log File Directory (--log_dir)

This allows you to specify which directory contains the log

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat --log_dir ~andy/logs

=head2 Token file (-p or --prob)

This provides a method of specifying the name of the
output probability token data file.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat

=cut

#************************************************************

use strict;
use warnings;

use Carp;
use IO::File;
use File::Basename;
use Getopt::Long;

use Pitonyak::SmallLogger;
use Pitonyak::SafeGlob qw(glob_spec_from_path);
use Pitonyak::BayesianTokenCounter;

#************************************************************
#**                                                        **
#**  usage([$name])                                        **
#**                                                        **
#**  Print the usage message to STDERR.                    **
#**  Input: optional program name to show in the message;  **
#**         defaults to $0.                                **
#**                                                        **
#************************************************************
sub usage {
    my $name = $0;
    $name = $_[0] if $#_ >= 0;
    print STDERR <<"EOF";
Usage: $name [-h] [-l file] -b bad_tokens_file -g good_tokens_file -p file [--log_cfg file] [--log_dir path]
Build the probability file
 -b, --bad=FILE     : file containing the bad tokens
 -g, --good=FILE    : file containing the good tokens
 -h, --help         : print this help message
 -l, --log=FILE     : base name for the log file
     --log_cfg=FILE : log configuration file
     --log_dir=PATH : path where the logs should be saved
 -p, --prob=FILE    : output token probability file

example: $name -g good_files.dat -b bad_files.dat -p probability.dat
EOF
    return;
}

#************************************************************
#**                                                        **
#**  Main program                                          **
#**                                                        **
#**  Input:  bad (SPAM) token file, good token file        **
#**  Output: token probability file                        **
#**                                                        **
#************************************************************

# Only the directory that holds this script is needed; it is the default
# location for the log file.
my ( undef, $program_path ) = fileparse($0);

my $help            = 0;
my $logfile         = '';
my $bad_token_file  = '';
my $good_token_file = '';
my $outfile         = '';
my $log_cfg         = '';
my $log_dir         = '';

Getopt::Long::Configure("bundling");
my $goodOptions = GetOptions(
    "bad|b=s"   => \$bad_token_file,
    "good|g=s"  => \$good_token_file,
    "help|?|h"  => \$help,
    "log|l=s"   => \$logfile,
    "log_cfg=s" => \$log_cfg,
    "log_dir=s" => \$log_dir,
    "prob|p=s"  => \$outfile,
);

# An explicit request for help is not an error.
if ($help) {
    usage();
    exit 0;
}

# GetOptions has already reported any option it could not parse; a missing
# required file name is treated the same way. Both are failures, so exit
# with a non-zero status.
if (   !$goodOptions
    || $bad_token_file eq ''
    || $good_token_file eq ''
    || $outfile eq '' )
{
    usage();
    exit 1;
}

# Configure the logger. Settings are applied in increasing priority:
# built-in defaults, then the log configuration file, then the explicit
# --log_dir and --log options.
my $log = Pitonyak::SmallLogger->new;
$log->log_name_date('');
$log->message_loc_format('(sub):(line):');
$log->open_append(1);
$log->log_path($program_path);
$log->read_from_file($log_cfg)    if $log_cfg ne '';
$log->log_path($log_dir)          if $log_dir ne '';
$log->log_primary_name($logfile)  if $logfile ne '';

# Load each token file into its own counter. The good tokens must come
# from the good token file and the bad tokens from the bad token file.
my $good_tokens =
  Pitonyak::BayesianTokenCounter::read_from_file($good_token_file);
my $bad_tokens =
  Pitonyak::BayesianTokenCounter::read_from_file($bad_token_file);

my $token_list = Pitonyak::BayesianTokenCounter->new;
$token_list->set_log($log);
$bad_tokens->set_log($log);
$good_tokens->set_log($log);

# Combine the two counters into probabilities and save the result.
$token_list->build_probabilities( $good_tokens, $bad_tokens );
$token_list->write_to_file($outfile);

#************************************************************

=pod

=head1 COPYRIGHT

Copyright 1998-2002, Andrew Pitonyak (perlboy@pitonyak.org)

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 Modification History

=head2 September 10, 2002

Version 1.00 First release

=cut

#************************************************************