#************************************************************

=head1 NAME

tokenize_file.pl - Read a list of email file specs and turn them into a token file

=head1 DESCRIPTION

The following options are supported

=head2 Case Sensitive Tokens (--case or -c)

Tokens are not considered case sensitive by default.
If you desire that the tokens I<Hello> and I<hello> be considered different,
turn case sensitive tokens on.

The following example will read the file token_list.dat if it exists,
and then read all the files matching the file spec.
The files will be tokenized with case sensitive tokens and the new file written out.

example:

    perl -w tokenize_file.pl -c -o token_list.dat -s ./good_files/*.msg

=head2 Case Sensitive File Search (--file_case or -f)

Files are specified based on "file specifications".
By default, the file specs are assumed to be case insensitive.
In the UNIX world, this may make a difference so you can turn
case sensitivity on with this option.

The following example will read the file token_list.dat if it exists,
and then read all the files matching the file spec.
This will not match files matching *.MSG.
The files will be tokenized and the new file written out.

The path portion will be case sensitive based on the operating system used.
If you are using Linux and you have the directories good_files and Good_Files,
the second directory will not be searched even with the default case insensitive
file matching.

example:

    perl -w tokenize_file.pl -f -o token_list.dat -s ./good_files/*.msg

=head2 Help (-h or -?)

Print usage instructions

example:

    perl -w tokenize_file.pl -h

=head2 Log File Name (-l or --log)

If a logfile is specified, then this is used as the logfile name.
By default, the log tokenize_file.log is created.

=head2 Log Configuration Files (--log_cfg)

You can create a configuration file for your logger and then
configure your log object by simply telling it to read the
specified configuration file.
To create an initial configuration file, write a perl script
that creates a logger, configures the logger, and then use the
write_to_file('log_cfg.dat') method.
This provides complete control over how the logger is configured.
You can set screen and file output levels, for example.

example:

    perl -w tokenize_file.pl -o token_list.dat -s ./*.msg --log_cfg ~andy/logs/default_log.dat

=head2 Log File Directory (--log_dir)

This allows you to specify which directory contains the log.

example:

    perl -w tokenize_file.pl -o token_list.dat -s ./*.msg --log_dir ~andy/logs

=head2 Token file (-o or --out)

This provides a method of specifying the name of the token data file.
If the file exists, then it is read. If it does not, then it is created.

example:

    perl -w tokenize_file.pl -o token_list.dat -s ./good_files/*.msg

=head2 Recurse Directories (-r or --recurse)

This causes all directories under the specified directory to be searched
for the given file spec.
In the example below, all files with extension msg will be included
regardless of their directory.

example:

    perl -w tokenize_file.pl -r -o token_list.dat -s ./good_files/*.msg

=head2 File Specs (-s or --spec)

This specifies the file specs to search.
If you desire to have three sets of file specs,
then include the spec parameter three times.

example:

    perl -w tokenize_file.pl -r -o token_list.dat -s ./good_files/*.msg -s ~andy/home/msg/*.msg

=cut

#************************************************************

use strict;
use warnings;

use Carp;
use IO::File;
use File::Basename;
use Getopt::Long;
use Pitonyak::SmallLogger;
use Pitonyak::SafeGlob qw(glob_spec_from_path);
use Pitonyak::BayesianTokenCounter;

#************************************************************
#**                                                        **
#**  usage([$name])                                        **
#**                                                        **
#**  Print the command-line help text to STDERR.           **
#**                                                        **
#**  Input: optional program name to show in the text;     **
#**         defaults to $0 when omitted or undefined.      **
#**                                                        **
#************************************************************
sub usage {
    my ($name) = @_;
    $name = $0 unless defined $name;

    print STDERR <<"EOF";
Usage: $name [-cfhr] [-l file] [--log_cfg file] [--log_dir path] -s spec -o file
Tokenize a file
 -c, --case       : case sensitive tokens
 -f, --file_case  : case sensitive file searches
 -h, --help       : print this help message
 -l, --log=FILE   : base name for the log file
 --log_cfg=FILE   : log configuration file
 --log_dir=PATH   : path where the logs should be saved
 -o, --out=FILE   : output token file
 -r, --recurse    : recurse directories while searching file specs
 -s, --spec=FILE  : file specs to search
example: $name --file_case -s ./good/*.msg -s ./home/good/*.msg -o good_files.dat
EOF
    return;
}

#************************************************************
#**                                                        **
#**  Main program                                          **
#**                                                        **
#**  Input: configuration file to use                      **
#**         file specs to match                            **
#**                                                        **
#************************************************************

# Only the directory part of the script's own path is needed; it is the
# default location for the log file.
my $program_path = ( fileparse($0) )[1];

my $recurse              = 0;
my $case_sensitive       = 0;
my $files_case_sensitive = 0;
my @spec                 = ();
my $help                 = 0;
my $outfile              = '';
my $logfile              = '';
my $log_cfg              = '';
my $log_dir              = '';

# "bundling" lets single-letter flags be combined, e.g. -cfr.
Getopt::Long::Configure('bundling');
my $options_ok = GetOptions(
    'spec|s=s'    => \@spec,
    'recurse|r'   => \$recurse,
    'help|?|h'    => \$help,
    'out|o=s'     => \$outfile,
    'log|l=s'     => \$logfile,
    'log_cfg=s'   => \$log_cfg,
    'log_dir=s'   => \$log_dir,
    'case|c'      => \$case_sensitive,
    'file_case|f' => \$files_case_sensitive,
);

# GetOptions returns false when it met an unknown or malformed option; it
# has already warned on STDERR, so show the usage text and fail.
if ( !$options_ok ) {
    usage();
    exit 1;
}

# An explicit request for help is a success.
if ($help) {
    usage();
    exit 0;
}

# Both an output token file and at least one file spec are mandatory;
# missing them is a usage error, so exit non-zero.
if ( $outfile eq '' || !@spec ) {
    usage();
    exit 1;
}

# Configure the logger.  The defaults set here are applied first so that a
# configuration file, and then the explicit --log_dir / --log options, can
# each override what came before.
my $log = Pitonyak::SmallLogger->new();
$log->log_name_date('');
$log->message_loc_format('(sub):(line):');
$log->open_append(1);
$log->log_path($program_path);
$log->read_from_file($log_cfg)    if $log_cfg ne '';
$log->log_path($log_dir)          if $log_dir ne '';
$log->log_primary_name($logfile)  if $logfile ne '';

# Load the existing token file.  This will create one if it does not exist!
my $token_list = Pitonyak::BayesianTokenCounter::read_from_file($outfile);
$token_list->set_log($log);

my $files_tokenized = 0;
my $old_tokens      = $token_list->num_tokens();

# The globber returns plain files only (never directories), optionally
# recursing and optionally matching file names case sensitively.
my $glob = Pitonyak::SafeGlob->new();
$glob->case_sensitive($files_case_sensitive);
$glob->recurse($recurse);
$glob->return_dirs(0);
$glob->return_files(1);

my $time_start = time();
foreach my $file_name ( $glob->glob_spec_from_path(@spec) ) {

    # NOTE: the spelling "Tolkenizing" is kept as-is so that anything
    # scanning existing logs for this message keeps working.
    $log->write_log_type( 'T', "Tolkenizing file $file_name" );
    ++$files_tokenized;
    $token_list->tokenize_file($file_name);
}

# No file name is passed here; presumably the counter writes back to the
# file it was read from -- confirm against Pitonyak::BayesianTokenCounter.
$token_list->write_to_file();

# Summarize the run: files handled in this run versus the counter's total,
# tokens added versus total tokens, and the elapsed wall-clock seconds.
my $new_tokens       = $token_list->num_tokens();
my $num_added_tokens = $new_tokens - $old_tokens;
my $num_files        = $token_list->num_files();
my $time_elapsed     = time() - $time_start;
$log->info(
"Parsed $files_tokenized/$num_files files into $num_added_tokens/$new_tokens with time $time_elapsed"
);

#************************************************************

=pod

=head1 COPYRIGHT

Copyright 1998-2002, Andrew Pitonyak (perlboy@pitonyak.org)

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 Modification History

=head2 September 10, 2002

Version 1.00 First release

=cut

#************************************************************