: # -*- perl -*-
eval 'exec perl -w -S $0 ${1+"$@"}'     # Let `sh' locate `perl'
 if 0;

# Copyright 2001, 2002 Stefan Merten

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

=head1 NAME

mail2clf - Convert mail into lines for a Web servers common log file format

=head1 SYNOPSIS

B<mail2clf> [B<-v>] [I<mail-file>...]

B<mail2clf> B<-H>

=head1 DESCRIPTION

Converts important information from a mail to lines for a common log
file. This format is used by web servers and there are a number of
programs which create beautiful visualizations. See L<"WEBALIZER"> for
hints for a configuration file for B<webalizer>.

If I<mail-file> is given at least once, each must be a path to a file
containing a single mail. These mail files are converted. Use C<-> to
read a single mail from STDIN.

Omitting I<mail-file> allows a combination of B<mail2clf> with
B<mail2thread>. The output of B<mail2thread> is expected on STDIN. The
result gives a better picture regarding threads in a bunch of mails
since all the mail belonging to a single thread is taken as a hit to a
single file named as the thread. B<-l> must be given to B<mail2thread>
to have long file names instead of just numbers.

The input is expected to be generated by B<mail2thread> and thus must
follow a simple format of two types of alternating line blocks.

=over 4

=item *

One or more lines without leading whitespace give one or more cleaned
subjects belonging to a single thread. If there is more than one
subject line, the first one is used as the thread name. However, it is
often useful to run B<mail2thread> with B<-e> so an exact match is
done on the cleaned subjects and there is only one subject line for
each thread.

=item *

After that one or more lines with leading whitespace gives the names
and full subjects of the mail files belonging to that thread. The
first word of the line gives the file name and the rest of the line is
ignored.

=back

=cut

###############################################################################

# Three-argument open() with a lexical file handle (used in `f2Ln') needs
# Perl 5.6.
require 5.006;

# Switch warning on
$^W = 1;

use strict;

use diagnostics;
use Getopt::Long qw( GetOptions );
use Carp qw( carp croak confess );
use FindBin;

{
  # Configure option parsing while POSIXLY_CORRECT is (temporarily) the only
  # environment variable. NOTE(review): presumably this makes the "default"
  # configuration pick up POSIX behaviour - confirm against Getopt::Long.
  local %ENV = ( POSIXLY_CORRECT => 1 );
  Getopt::Long::config("default", "bundling");
}

# Forward declaration; `errEx' is used before its definition below.
sub errEx($@);

use Mail::Internet;
use Mail::Address;
use Time::ParseDate;
use Time::CTime;

###############################################################################
###############################################################################
# Constants

# Field names to be used for a mail description hash
my $KeySubject = "subject";
my $KeyFileName = "file-name";

###############################################################################
###############################################################################
# Options

# Maps Getopt::Long option specifications to references of option variables
my %OptS2OptVar;
# Usage message; extended as options are declared
my $SUsage = "Usage: $FindBin::Script";

=head1 OPTIONS

=over 4

=item B<-v>

=item B<--verbose>

Operate verbose.

=cut

my $verb = 0;
$OptS2OptVar{"v|verbose"} = \$verb;
$SUsage .= " [-v]";

=item B<-H>

=item B<--help>

Generate the man page for this program on standard output.

=cut

my $help = 0;
$OptS2OptVar{"H|help"} = \$help;
$SUsage .= " [-H]";

=back

If an unknown option such as B<-.> is given, a short usage message is
generated.

=cut

$SUsage .= " [<mail-file>...]";

# Options and usage
errEx(1, $SUsage)
    unless GetOptions(%OptS2OptVar);

if($help) {
  # List form avoids the shell, so odd characters in the script path are safe.
  # exec() only returns on failure.
  exec("perldoc", "$FindBin::Bin/$FindBin::Script")
      or errEx(1, "Can't run perldoc: $!");
}

# errEx(1, $SUsage)
#     unless @ARGV < 0;

###############################################################################
###############################################################################
# More constants

###############################################################################
###############################################################################
# Variables

###############################################################################
###############################################################################
# Unspecialized functions

# Outputs the given strings `@lns' as error message on STDERR. Every line of
# every string is prefixed with the script name. Returns 0.
sub errO(@) {
  my( @lns ) = @_;

  foreach my $lns ( @lns ) {
    foreach my $ln ( split(/\n/, $lns) ) {
      print(STDERR "$FindBin::Script: $ln\n");
    }
  }
  return 0;
}

##############################################################################

# Outputs the given strings `@lns' as verbose text if B<-v> was given. Returns
# 0.
sub vrbO(@) {
  my( @lns ) = @_;

  if($verb) {
    foreach my $ln ( @lns ) {
      errO("## $ln");
    }
  }
  return 0;
}

##############################################################################

# Outputs error messages `@msgs' and exits with code `$code'.
sub errEx($@) {
  my( $code, @msgs ) = @_;

  errO(@msgs);
  exit($code);
}

##############################################################################

# Prepends the path `$relP' relative to the one this script is started in to
# environment variable $PATH if this directory exists. Returns resulting path
# on success and an empty string otherwise.
sub addMyP2P(;$ ) {
  my( $relP ) = @_;

  $relP = ""
      unless defined($relP);
  my $dstP = $FindBin::RealBin;
  $dstP .= "/$relP"
      if $relP;
  return ""
      unless -d $dstP;

  # An unset or empty $PATH must not leave a trailing `:' behind because an
  # empty component means the current directory.
  my $oldP = $ENV{"PATH"};
  $ENV{"PATH"} = defined($oldP) && length($oldP) ? $dstP . ":" . $oldP : $dstP;
  return $dstP;
}

##############################################################################
##############################################################################
# Specialized functions

# Reads mail file `$fNm' extracts relevant data and returns pair consisting of
# seconds since the epoch and a CLF line. If `$fNm' is no scalar it is
# interpreted as a reference to a hash, which may contain predefined data
# under the `$Key...' constants. The file name `-' reads the mail from STDIN.
#
# Exits with code 2 if no file name is known or the file can not be opened.
# Missing or unparsable header fields are reported and replaced by defaults
# so a single odd mail does not abort the whole run.
sub f2Ln($ ) {
  my( $fNm ) = @_;

  my %f;
  if(ref($fNm)) {
    %f = %$fNm;
  }
  else {
    $f{$KeyFileName} = $fNm;
  }
  # Test for definedness and length so a file named `0' is accepted
  errEx(2, "File name unknown")
      unless defined($f{$KeyFileName}) && length($f{$KeyFileName});
  $fNm = $f{$KeyFileName};

  my $mail;
  if($fNm eq "-") {
    # Three-argument open() does not treat `-' specially so map it explicitly
    $mail = Mail::Internet->new(\*STDIN);
  }
  else {
    # Three-argument form: characters such as `>' or `|' in the file name can
    # not select an open mode
    my $fh;
    errEx(2, "Can't open `$fNm': $!")
        unless open($fh, "<", $fNm);
    $mail = Mail::Internet->new($fh);
    close($fh);
  }
  my $hdr = $mail->head();
  my $bdy = $mail->body();

  # Host field: address of the first sender
  my $frmLn = $hdr->get("From");
  my( $frm ) = defined($frmLn) ? Mail::Address->parse($frmLn) : ( );
  my $adr = defined($frm) ? $frm->address() : undef;
  unless(defined($adr) && length($adr)) {
    errO("No sender address in `$fNm'");
    $adr = "unknown";
  }
  # Save mail address from beeing collected by a harvester
  $adr =~ s/\@/../;

  # Date field; unusable dates end up at the epoch
  my $dtLn = $hdr->get("Date");
  unless(defined($dtLn)) {
    errO("No date in `$fNm'");
    $dtLn = "";
  }
  my $secs = parsedate($dtLn, NO_RELATIVE => 1);
  unless(defined($secs)) {
    errO("Can't parse date in `$fNm'");
    $secs = 0;
  }
  my $dt = strftime("[%d/%b/%Y:%H:%M:%S +0000]", gmtime($secs));

  # Request field: a predefined (thread) subject wins over the mail's own one
  my $subj = defined($f{$KeySubject}) ? $f{$KeySubject} : $hdr->get("Subject");
  if(defined($subj)) {
    # Turn the subject into a single word usable as a URL
    $subj =~ s/\s*$//;
    $subj =~ s/^\s*//;
    $subj =~ s/\s+/_/g;
    $subj =~ s/[]["']//g;
  }
  # An empty subject would result in a malformed request field
  $subj = "NO_SUBJECT"
      unless defined($subj) && length($subj);

  # Bytes field: size of the mail body
  my $sz = 0;
  $sz += length($_)
      foreach @$bdy;

  # Referrer field: message id
  my $id = $hdr->get("Message-Id");
  if(defined($id)) {
    chomp($id);
  }
  else {
    $id = "-";
  }

  return ( $secs, "$adr - - $dt \"GET $subj HTTP/1.0\" 200 $sz $id -" );
}

# CLF fields
# ----------
# host => From:
# -
# -
# [2*digit/3*letter/4*digit:2*digit:2*digit:2*digit [-+]4*digit] => Date:
# request => "GET Subject: HTTP/1.0"
# 200
# bytes => Body size
# referrer => Message id
# client

##############################################################################

# Extracts thread information from STDIN and returns an array of mail file
# infos being references to hashs indexed by `$Key...' constants. Mail files
# must be given as file paths.
#
# Lines without leading whitespace are subject lines; the first one of a run
# names the thread. Lines with leading whitespace are mail files; only their
# first word is used. Empty lines are ignored.
sub thrds2Fs() {
  my @r;

  my $thrd; # First subject line of the block currently read
  my $subj; # Subject used for the mail files currently read
  while(defined(my $ln = <STDIN>)) {
    chomp($ln);
    # Neither a usable subject nor a usable file name
    next
        unless $ln =~ /\S/;

    unless($ln =~ s/^\s+//) {
      # A subject line; only the first one of a block is kept
      $thrd = $ln
          unless defined($thrd);
    }
    else {
      # A mail file
      if(defined($thrd)) {
        # First file of a new thread
        $subj = $thrd;
        undef($thrd);
      }
      push(@r, { $KeyFileName => ( split(" ", $ln) )[0],
                 $KeySubject => $subj });
    }
  }
  return @r;
}

##############################################################################
##############################################################################
# Now work

# Maps seconds since the epoch to the CLF line of the mail sent then
my %secs2Ln;

my @fs = @ARGV ? @ARGV : thrds2Fs();
foreach my $f ( @fs ) {
  my( $secs, $ln ) = f2Ln($f);
  # Keys must be unique: move to the next free second so that mails with the
  # same date do not overwrite each other
  $secs++
      while defined($secs2Ln{$secs});
  $secs2Ln{$secs} = $ln;
}

# Sort entries by time just like a web server does
foreach my $secs ( sort{ $a <=> $b }(keys(%secs2Ln)) ) {
  print($secs2Ln{$secs} . "\n");
}

##############################################################################

=head1 WEBALIZER

Since B<webalizer> is a free tool to produce nice graphics, it may be
used for the visualization.

=head2 Configuration file

The following are useful settings in a B<webalizer> configuration
file. Only the differences to the sample file found in the
B<webalizer> documentation are given.

=over 4

=item LogFile

No log file may be given to use STDIN.

=item HostName

If a mailing list is visualized, the mail address of the list is a
good value for this variable.

=item HTMLExtension

Since the subjects are mapped into file names without any extension,
this I<must> be given and it needs to be an empty string.

=item VisitTimeout 0

The notion of a visit doesn't make much sense for a mailing list, so
this should be turned off.

=item CountryGraph no

=item TopCountries 0

Since the host names are mail addresses of the senders of the mail,
this makes no sense.

=back

So far neither the referrer field nor the client field of the combined
log file format is used.

=head2 Notion mapping

The notions used by B<webalizer> or other visualization tools are
meant to be used for web server statistics of course. However,
B<mail2clf> maps mail, so the following notion mapping applies.

  Web           Mail
  --------------------------------------
  URL           (Thread) subject
  Hits          Number of single mails
  Files         ditto
  Site          Address of mail author
  KBytes        Size of mail body
  Visits        Makes no sense
  Entry pages   ditto
  Exit pages    ditto

=head1 EXAMPLE

The following pipeline produces a visualization of the mail from a
mailing list considering threads.

  mail2thread -p '\[[a-z]?ox\]' -e -l ~/Mail/oekonux/arc/* | mail2clf |
  webalizer -i

=over 4

=item *

The mail is stored in C<~/Mail/oekonux/arc>.

=item *

There are a number of possible prefixes due to the mailing list all
matching C<\[[a-z]?ox\]>.

=item *

The subjects in the mailing list are used quite disciplined so C<-e>
can be used.

=item *

B<webalizer> may not use any history information.

=item *

A file C<webalizer.conf> needs to be found in the current directory.

=back

=head1 PREREQUISITES

Because this is a Perl program, Perl (>= V5.005) must be installed.

This program needs the great C<MailTools> package installed. Try

  http://search.cpan.org/search?dist=MailTools

This program needs the C<Time-modules> package installed. Try

  http://search.cpan.org/search?dist=Time-modules

=head1 SEE ALSO

L<mail2thread(1)>

L<webalizer(1)>

L<Mail::Internet>

=head1 AUTHOR

Stefan Merten

=head1 LICENSE

This program is licensed under the terms of the GPL. See

  http://www.gnu.org/licenses/gpl.txt

=head1 AVAILABILITY

See

  http://www.merten-home.de/FreeSoftware/mail2clf/

=cut