build_probabilities.pl


#************************************************************

=head1 NAME

build_probabilities.pl - Turn two token files into a probability file

=head1 DESCRIPTION

The following options are supported:

=head2 Bad (SPAM) Token File (-b or --bad)

Specify the token file containing the bad tokens.
In other words, the tokens from the SPAM messages.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat

=head2 Good Token File (-g or --good)

Specify the token file containing the good tokens.
In other words, the tokens from the non-SPAM messages.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat

=head2 Help (-h or -?)

Print usage instructions.

example: perl -w build_probabilities.pl -h

=head2 Log File Name (-l or --log)

If a logfile is specified, then this is used as the logfile name.

=head2 Log Configuration Files (--log_cfg)

You can create a configuration file for your logger and then configure
your log object by simply telling it to read the specified configuration file.
To create an initial configuration file, write a Perl script that
creates a logger, configures it, and then calls the logger's
write_to_file('log_cfg.dat') method.

This provides complete control over how the logger is configured.
You can set screen and file output levels, for example.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat --log_cfg ~andy/logs/default_log.dat

=head2 Log File Directory (--log_dir)

This allows you to specify the directory in which the log file is saved.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat  --log_dir ~andy/logs


=head2 Probability File (-p or --prob)

This provides a method of specifying the name of the output probability token data file.

example: perl -w build_probabilities.pl -p prob.dat -b bad.dat -g good.dat

=cut

#************************************************************

use Carp;
use IO::File;
use File::Basename;
use strict;
use Getopt::Long;
use Pitonyak::SmallLogger;
use Pitonyak::SafeGlob qw(glob_spec_from_path);
use Pitonyak::BayesianTokenCounter;

# Print the command-line usage summary to STDERR.
#
# Arguments:
#   $_[0] - optional program name to display; when the sub is called with
#           no arguments, $0 is displayed instead.
sub usage {
    my $name = @_ ? $_[0] : $0;

    # The synopsis names only switches the option parser accepts; the old
    # "[-cfhr]" bundle advertised -c, -f and -r, which do not exist.
    print STDERR <<"EOF";

Usage: $name [-h] [-l file] -b bad_tokens_file -g good_tokens_file -p file [--log_cfg file] [--log_dir path]
Build the probability file

 -b, --bad=FILE      : file containing the bad tokens
 -g, --good=FILE     : file containing the good tokens
 -h, --help          : print this help message
 -l, --log=FILE      : base name for the log file
 --log_cfg=FILE      : log configuration file
 --log_dir=PATH      : path where the logs should be saved
 -p, --prob=FILE     : output token probability file

example: $name -g good_files.dat -b bad_files.dat -p probability.dat

EOF
}

#************************************************************
#**                                                        **
#**  Input:  bad token file (-b), good token file (-g)     **
#**  Output: token probability file (-p)                   **
#**                                                        **
#************************************************************

# Split $0 into name, directory and suffix.  No suffix patterns are
# supplied, so fileparse() is asked to strip nothing from the name.
# $program_path is used later as the default log directory.
my @suffixlist = ();
my ( $program_name, $program_path, $program_suffix ) =
  fileparse( $0, @suffixlist );

# Option defaults.  An empty string means "not given on the command line".
my $help            = 0;
my $logfile         = '';
my $bad_token_file  = '';
my $good_token_file = '';
my $outfile         = '';
my $log_cfg         = '';
my $log_dir         = '';

# "bundling" lets single-letter switches be combined (e.g. -hl file).
Getopt::Long::Configure("bundling");
my $goodOptions = GetOptions(
    "bad|b=s"   => \$bad_token_file,
    "good|g=s"  => \$good_token_file,
    "help|?|h"  => \$help,
    "log|l=s"   => \$logfile,
    "log_cfg=s" => \$log_cfg,
    "log_dir=s" => \$log_dir,
    "prob|p=s"  => \$outfile,
);

# An explicit request for help is a normal, successful run.
if ($help) {
    usage();
    exit 0;
}

# GetOptions returns false when it met an unknown or malformed option.
# That case, and a missing required file name, are errors: show the usage
# text and exit non-zero so calling scripts can detect the failure.
if (   !$goodOptions
    || $bad_token_file  eq ''
    || $good_token_file eq ''
    || $outfile         eq '' )
{
    usage();
    exit 1;
}

# Create and configure the logger that is handed to every token counter.
# Settings are applied in this order, so later ones win:
#   1. built-in defaults (log directory = directory of this script),
#   2. the configuration file given with --log_cfg, if any,
#   3. the --log_dir and --log command-line values, if any.
# Direct method-call syntax is used instead of the indirect "new Class"
# form, which perlobj discourages because it can be parsed ambiguously.
my $log = Pitonyak::SmallLogger->new;

# NOTE(review): the exact meaning of these three settings is defined by
# Pitonyak::SmallLogger; presumably no date in the log name, a
# "(sub):(line):" message prefix, and append mode - confirm against it.
$log->log_name_date('');
$log->message_loc_format('(sub):(line):');
$log->open_append(1);

$log->log_path($program_path);
$log->read_from_file($log_cfg)   if defined($log_cfg) and $log_cfg ne '';
$log->log_path($log_dir)         if defined($log_dir) and $log_dir ne '';
$log->log_primary_name($logfile) if defined($logfile) and $logfile ne '';

# Load the two token counters from the files named on the command line.
# The good counter must come from the good token file; it was previously
# read from the bad token file, which made both counters identical.
my $good_tokens =
  Pitonyak::BayesianTokenCounter::read_from_file($good_token_file);
my $bad_tokens =
  Pitonyak::BayesianTokenCounter::read_from_file($bad_token_file);

# Empty counter that receives the computed probabilities.
my $token_list = Pitonyak::BayesianTokenCounter->new;

# All three counters report through the same logger.
$token_list->set_log($log);
$bad_tokens->set_log($log);
$good_tokens->set_log($log);

# Combine the good and bad counts, then write the result to the file
# given with -p / --prob.
$token_list->build_probabilities( $good_tokens, $bad_tokens );
$token_list->write_to_file($outfile);

#************************************************************

=pod

=head1 COPYRIGHT

Copyright 1998-2002, Andrew Pitonyak (perlboy@pitonyak.org)

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 Modification History

=head2 September 10, 2002

Version 1.00 First release

=cut

#************************************************************