spam_check_file.pl


#************************************************************

=head1 NAME

spam_check_file.pl - Check to see if files are likely to be SPAM

=head1 DESCRIPTION

The following options are supported

=head2 Add Header To Files (-a, --add_header_level)

Add a header to file if spam probability is greater or equal to this value.
Default value is 2 which means that a header will never be added because
the smallest probability is 0 and greatest is 1. This allows for you to
have a header added only if the file is above a certain thresh-hold.

The following example will only add a header if the spam probability is at least 0.90.

example: perl -w spam_check_file.pl -s ./*.msg -p prob.dat -a 0.90

=head2 Case Sensitive Tokens (--case or -c)

Tokens are not considered case sensitive by default. If you
desire that the tokens I<Hello> and I<hello> be considered different,
turn case sensitive tokens on.

example: perl -w spam_check_file.pl -c -s ./*.msg -p prob.dat

=head2 Case Sensitive File Search (--file_case or -f)

Files are specified based on "file specifications".
By default, the file specs are assumed to be case insensitive.
In the UNIX world, this may make a difference so you can turn
case sensitivity on with this option.

example: perl -w spam_check_file.pl -fc -s ./*.msg -p prob.dat

=head2 Help (-h or -?)

Print useage instructions

example: perl -w spam_check_file.pl -h

=head2 Log File Name (-l or --log)

If a logfile is specified, then this is used as the logfile name.
By default, the log tokenize_file.log is created.

=head2 Log Configuration Files (--log_cfg)

You can create a configuration file for your logger and then configure
your log object by simply telling it to read the specified configuration file.
To create an initial configuration file, write a perl script that
creates a logger, configures the logger, and then use the write_to_file('log_cfg.dat')
method.

This provides complete control over how the logger is configured.
You can set screen and file output levels, for example.

example: perl -w spam_check_file.pl -c -s ./*.msg -p prob.dat --log_cfg ~andy/logs/default_log.dat

=head2 Log File Directory (--log_dir)

This allows you to specify which directory contains the log

example: perl -w spam_check_file.pl -c -s ./*.msg -p prob.dat --log_dir ~andy/logs


=head2 Probability Token File (-p or --prob)

This provides a method of specifying the name of the probability token data file.

example: perl -w spam_check_file.pl -c -s ./*.msg -p prob.dat

=head2 Recurse Directories (-r or --recurse)

This causes all directories under the specified directory to be searched for the given file spec.

example: perl -w spam_check_file.pl -r -s ./*.msg -p prob.dat

=head2 Setting the SPAM Limit (Sensitiveity) with (-sl or --spam_limit)

By default, a file is considered SPAM if the probability is greater than 0.90.
You can make this less or more sensitive by changing this value.

To make this more strict, for example, change the probability to 0.95.

example: perl -w spam_check_file.pl -r -s ./*.msg -p prob.dat -sl 0.95

=head2 File Specs (-s or --spec)

This specifies the file specs to search.
If you desire to have three sets of file specs, then include the spec parameter three times.

example: perl -w spam_check_file.pl -r -s ./*.msg -p prob.dat -s *.MES -s ~andy/*.msg

=cut

#************************************************************

use Carp;
use IO::File;
use File::Basename;
use strict;
use Pitonyak::SmallLogger;
use Pitonyak::SafeGlob qw(glob_spec_from_path);
use Pitonyak::BayesianTokenCounter;
use Getopt::Long;

# Print program usage
sub usage {
    my $name = $0;
    $name = $_[0] if $#_ >= 0;

    print STDERR << "EOF";

Usage: $name [-cfhr] [-l file] -s spec -p prob_file [--log_cfg file] [--log_dir path] [-sl NUM] [-a NUM]
Tokenize a file

 -a, --add_header_level=PROB : Add a header to file if spam probability is
                             : greater or equal to this value. Default is 2
                             : Smallest probability is 0 and greatest is 1
 -c, --case             : case sensitive tokens
 -f, --file_case        : case sensitive file searches
 -h, --help             : print this help message
 -l, --log=FILE         : base name for the log file
 --log_cfg=FILE         : log configuration file
 --log_dir=PATH         : path where the logs should be saved
 -p, --prob=FILE        : file containing the probability tokens
 -r, --recurse          : recurse directories while searching file specs
 -sl, --spam_limit==NUM : File is not considered SPAM if the probability
                        : is less than the spam_limit. Default is 0.90
 -s, --spec=FILE        : file specs to search

example: $name --file_case -s ./good/*.msg -s ./home/good/*.msg -p probability.dat

EOF
}

#************************************************************
#**                                                        **
#**  Input: configuration file to use                      **
#**         file specs to match                            **
#**                                                        **
#************************************************************

my @suffixlist = ();
my ( $program_name, $program_path, $program_suffix ) =
  fileparse( $0, @suffixlist );

my $recurse              = 0;
my $case_sensitive       = 0;
my $files_case_sensitive = 0;
my @spec                 = ();
my $help                 = 0;
my $logfile              = '';
my $prob_file            = '';
my $log_cfg              = '';
my $log_dir              = '';
my $spam_limit           = 0.90;
my $add_header_level     = 2.0;

Getopt::Long::Configure("bundling");
my $goodOptions = GetOptions(
    "add_header_level|a=f" => \$add_header_level,
    "case|c"               => \$case_sensitive,
    "file_case|f"          => \$files_case_sensitive,
    "help|?|h"             => \$help,
    "log|l=s"              => \$logfile,
    "log_cfg|lc=s"         => \$log_cfg,
    "log_dir|ld=s"         => \$log_dir,
    "prob|p=s"             => \$prob_file,
    "recurse|r"            => \$recurse,
    "spam_limit|sl=f"      => \$spam_limit,
    "spec|s=s"             => \@spec,
);

if ( $help || $prob_file eq '' || $#spec < 0 ) {
    usage();
    exit 0;
}

my $log = new Pitonyak::SmallLogger;
$log->log_name_date('');
$log->file_output( {} );
$log->message_loc_format('(sub):(line):');
$log->open_append(1);
$log->log_path($program_path);
$log->read_from_file($log_cfg)   if defined($log_cfg) and $log_cfg ne '';
$log->log_path($log_dir)         if defined($log_dir) and $log_dir ne '';
$log->log_primary_name($logfile) if defined($logfile) and $logfile ne '';

# This will create one!
my $prob_tokens = Pitonyak::BayesianTokenCounter::read_from_file($prob_file);
$prob_tokens->set_log($log);

my $glob = new Pitonyak::SafeGlob();
$glob->case_sensitive($files_case_sensitive);
$glob->recurse($recurse);
$glob->return_dirs(0);
$glob->return_files(1);
my $file_token_list  = new Pitonyak::BayesianTokenCounter;
my $empty_tokens     = {};
my $last_probability = 0;

foreach my $file_name ( $glob->glob_spec_from_path(@spec) ) {

    # I keep using the same object but I clear the
    # tokens before each file!
    $file_token_list->tokens($empty_tokens);
    $log->write_log_type( 'T', "Tolkenizing $file_name" );
    $file_token_list->tokenize_file($file_name);
    $last_probability = $prob_tokens->rate_tokens($file_token_list);
    $log->info("$last_probability : $file_name");
    if ( $last_probability >= $add_header_level ) {

        # Add the header to the file
        # Enable slurp mode!
        local $/;
        undef $/;
        if ( !rename( $file_name, "$file_name.bak" ) ) {
            $log->error(
                "Failed to rename $file_name to $file_name.bak because $!");
        }
        else {
            my $handle = new IO::File;
            if ( !$handle->open( "$file_name.bak", 'r' ) ) {
                $log->error("Failed to open file $file_name.bak because $!");
                if ( !rename( "$file_name.bak", $file_name ) ) {
                    $log->error(
"Failed to restore $file_name from $file_name.bak because $!"
                    );
                }
            }
            else {
                my $spam_text = $last_probability < $spam_limit ? ' not' : '';
                my $new_header =
"X-Andy-Spam: Probably$spam_text spam with score $last_probability\n";
                my $header_txt      = '';
                my $rest_of_message = '';
                my $message         = <$handle>;
                $handle->close();
                if ( !$handle->open( "$file_name", 'w' ) ) {
                    $log->error("Failed to open file $file_name because $!");
                    if ( !rename( "$file_name.bak", $file_name ) ) {
                        $log->error(
"Failed to restore $file_name from $file_name.bak because $!"
                        );
                    }
                }
                else {

                    # Do NOT put multiple headers into the same file
                    # If there is already one, then simply replace it!
                    if ( $message =~ /\A(.*?)^X-Andy-Spam:.*?(^.*)\Z/mis ) {
                        $handle->print("$1$new_header");
                        $handle->print($2);
                        undef $message;
                    }

                    # Next, attempt to place this AFTER the Subject line
                    elsif ( $message =~ /\A(.*?^Subject:.*?)(^\S*?:.*)\Z/mis ) {
                        $handle->print("$1$new_header");
                        $handle->print($2);
                        undef $message;
                    }
                    else {
                        $handle->print($new_header);
                        $handle->print($message);
                        undef $message;
                    }
                    $handle->close();
                    if ( !unlink("$file_name.bak") ) {
                        $log->error(
                            "Failed to delete $file_name.bak because $!");
                    }
                }
            }
        }
    }
}

$last_probability < $spam_limit ? 0 : 1;

#************************************************************

=pod

=head1 COPYRIGHT

Copyright 1998-2002, Andrew Pitonyak (perlboy@pitonyak.org)

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 Modification History

=head2 September 10, 2002

Version 1.00 First release

=cut

#************************************************************