#!/usr/bin/perl
#
# INSTITUT FUER INFORMATIK
# der Ludwig-Maximilians Universitaet Muenchen
#
# FAKULTAET FUER INFORMATIK
# der Technischen Universitaet Muenchen
#
# Advanced practical course (Fortgeschrittenenpraktikum)
# ------------------------------------------------------
# Implementation of an analysis tool for
# log files of WWW servers at BMW AG
#
# Peter Kai Wimmer
# Gabelsbergerstr. 28/IV
# 80333 Muenchen
# Tel./Fax: (089) 523 72 65
# E-Mail: peter@leo.org
#         wimmer@informatik.tu-muenchen.de
#
# "The jungle is nonsense, Ernie, I'm going home" (Bert, Sesame Street)
#
# Overview
# --------
# The script reads a WWW server log file line by line, splits each line
# into global fields ($host, $status, $bytes, ...), applies the filters
# given on the command line and then either prints the matching lines or
# accumulates '-list' / '-count' statistics that are printed at the end.
#
# The script deliberately works on package globals: the filter and list
# code is generated as text at start-up and run with string 'eval' for
# every log line, and that generated code refers to the parsed fields by
# their global names.

$false = 0;
$true  = 1;

#
# standard log file name
#
$logfile = "httpd-log.all";

#
# filter types
#
# a global variable with the same name must be set by &parse for every
# type listed here (the generated filter code reads '$<type>')
#
@filterTypes = ('host', 'logname', 'authuser', 'method', 'protocol',
                'domain', 'topdomain', 'dir', 'file', 'param',
                'browser', 'status', 'bytes', 'date', 'time');

#
# list types (plural of the variable name that is collected)
#
@listTypes = ('hosts', 'lognames', 'authusers', 'methods', 'protocols',
              'domains', 'topdomains', 'dirs', 'files', 'params',
              'browsers');

#
# count types
#
@countTypes = ('bytes', 'requests');

#
# Flags
#

# use regular expressions (true) or shell-style expressions (false)?
$regexp = $false;

# print lines verbosely?
$verbose = $false;

# print logline (false) or list/count (true)?
$printExtended = $false;

# sort alphabetically (default; false) or numerically (true)?
$sortNum = $false;

# log file format: Common (false) or Netscape (true)?
$netscape = $false;

#
# Map months to numbers
#
%monthName = (
    'Jan',  1, 'Feb',  2, 'Mar',  3, 'Apr',  4,
    'May',  5, 'Jun',  6, 'Jul',  7, 'Aug',  8,
    'Sep',  9, 'Oct', 10, 'Nov', 11, 'Dec', 12,
);

#
# Expand batch files: an argument of the form '@file' is replaced by the
# whitespace-separated words of that file (lines starting with '#' are
# skipped).  The expanded argument list ends up in @argv.
#
foreach $arg (@ARGV) {
    local ($batchfile, $batchline, @batchline);

    if ($arg =~ /^\@/) {
        $batchfile = substr($arg, 1);
        # three-argument open: the file name can never be taken as a mode
        open(BATCHFILE, '<', $batchfile)
            || die "$0: Can't open $batchfile\n";
        -T $batchfile || die "$0: $batchfile: not a batchfile\n";
        while ($batchline = <BATCHFILE>) {
            ($batchline =~ /^#/) && next;
            @batchline = split(' ', $batchline);
            foreach $batcharg (@batchline) {
                push(@argv, $batcharg);
                # echo every argument taken from the batch file
                print "$batcharg\n";
            }
        }
        close(BATCHFILE);
    }
    else {
        push(@argv, $arg);
    }
}

#
# Main Loop: evaluate the command line
#
# Side effects: sets the flags above, fills %filter and %output and
# generates the code strings $prg_filter and $prg_account{'list'}.
#
MAIN: while (@argv) {
    local ($arg, $type, $outputType);
    local ($filtertype, $listVar);

    $arg = shift(@argv);
    ($arg eq '-help')     && (&help) && exit;
    ($arg eq '-l')        && ($logfile = shift(@argv)) && next;
    ($arg eq '-verbose')  && ($verbose = $true) && next;
    ($arg eq '-sort')     && ($sortNum = $true) && next;
    ($arg eq '-netscape') && ($netscape = $true) && next;
    ($arg eq '-regexp')   && ($regexp = $true) && next;

    #
    # Filter
    #
    foreach $known (@filterTypes) {
        # the option must be exactly '-<type>'; a substring match would
        # accept misspelled options and generate code for variables that
        # do not exist
        ($arg eq "-$known") || next;
        $type = $known;

        # collect all values up to the next option
        $arg = shift(@argv);
        while (($arg !~ /^-.*/) && ($arg ne "")) {
            # translate shell-style expressions to regular expressions
            if (!$regexp) {
                # . -> \.
                $arg =~ s|\.|\\\.|g;
                # * -> .*
                $arg =~ s|\*|\.\*|g;
                # ? -> .
                $arg =~ s|\?|\.|g;
                # [^ -> [\^
                $arg =~ s|\[\^|\[\\\^|g;
            }
            if ($type eq 'status') {
                ($arg eq 'ok')    && ($arg = '20.');
                ($arg eq 'error') && ($arg = '40. 50.');
            }
            $filter{$type} .= $arg . ' ';
            $arg = shift(@argv);
        }

        # bytes, date and time are range filters handled directly in
        # &filter; every other type becomes one generated statement such as
        #   ($host !~ /^a$|^b$/) && return scalar($false);
        if ($type !~ /^(bytes|date|time)$/) {
            $filtertype = $filter{$type};
            # remove the trailing blank
            chop $filtertype;
            # alternatives: 'a b' -> 'a$|^b'
            $filtertype =~ s/ /\$|^/g;
            # escape '/' for the regular expression
            $filtertype =~ s|/|\\/|g;
            # NOTE(review): the pattern comes straight from the command
            # line / batch file and is evaluated as code; do not run this
            # script with arguments from an untrusted source.
            # The statement is assembled on its own before it is appended,
            # so patterns given earlier are never rewritten.
            $prg_filter .= '($' . $type . ' !~ /^' . $filtertype
                         . '$/) && return scalar($false);' . "\n";
        }

        # the word that ended the value list is the next option
        ($arg ne "") && unshift(@argv, $arg);
        next MAIN;
    }

    #
    # Output
    #
    if (($arg eq '-count') || ($arg eq '-list')) {
        # disable line-by-line listing
        $printExtended = $true;

        # 'list' or 'count'
        $outputType = substr($arg, 1);

        $arg = shift(@argv);
        while (($arg !~ /^-.*/) && ($arg ne "")) {
            $output{$outputType} .= $arg . ' ';

            #
            # list
            #
            if ($outputType eq "list") {
                # plain string comparison: the argument is not a pattern
                if (!grep($_ eq $arg, @listTypes)) {
                    print "$0: Unknown list type -- $arg\n";
                    die "$0: Try '$0 -help' for more information.\n";
                }
                # variable name = list type without the trailing 's'
                $listVar = $arg;
                chop $listVar;
                # generates (e.g.)
                #   $list{"hosts"} .= $host . " ";
                $prg_account{'list'} .= '$list{"' . $arg . '"} .= $'
                                      . $listVar . ' . " ";' . "\n";
            }

            #
            # count
            #
            if ($outputType eq "count") {
                if (!grep($_ eq $arg, @countTypes)) {
                    print "$0: Unknown count type -- $arg\n";
                    die "$0: Try '$0 -help' for more information.\n";
                }
            }
            $arg = shift(@argv);
        }
        ($arg ne "") && unshift(@argv, $arg);
        next MAIN;
    }

    #
    # Error
    #
    if ($arg ne "") {
        print "$0: Illegal option -- $arg\n";
        die "$0: Try '$0 -help' for more information.\n";
    }
}

# for debugging: code strings
#
# print "Filter: $prg_filter";
# print "List: $prg_account{'list'}";

#
# Process the log file
#
open(LOGFILE, '<', $logfile) || die "$0: Can't open $logfile\n";
-T $logfile || die "$0: $logfile: not a logfile\n";

while ($logline = <LOGFILE>) {
    # a Netscape server writes two lines per request: join them
    ($netscape) && (chop $logline) && ($logline .= ' ' . <LOGFILE>);

    &parse;
    if (&filter) {
        &account;
        &out;
    }
}
close LOGFILE;

#
# Print the collected lists and counters
#
foreach $listName (@listTypes) {
    ($list{$listName}) && (&printArray(&countItems($list{$listName})));
}
foreach $countName (@countTypes) {
    ($count{$countName}) && (print "$countName $count{$countName}\n");
}

#
# Parse
#
# Splits the global $logline into the global fields $host, $logname,
# $authuser, $date, $request, $status, $bytes (plus $domain, $browser and
# $os for the Netscape format), then derives $method, $protocol, $path,
# $httpVersion, $day, $month (as number), $year, $hour, $minute, $second,
# $param, $domain, $dir, $file, $topdomain and $port.
# Fields whose pattern does not match end up undefined.
#
sub parse {
    # '/' occurs in $path, so the patterns below that deal with paths
    # use '|' as delimiter instead of escaping every '/'

    if ($netscape) {
        ($host, $logname, $authuser, $date, $request, $status, $bytes,
         $domain, $browser, $os) = $logline =~
            /^(.*) (.*) (.*) \[(.*)\] \"(.*)\" (.*) (.*) (.*) (.*) \((.*)\)$/;
        ($method, $path, $httpVersion) =
            $request =~ m|^(\w+) /(.*) HTTP/(.*)$|;
    }
    else {
        ($host, $logname, $authuser, $date, $request, $status, $bytes) =
            $logline =~ /^(.*) (.*) (.*) \[(.*)\] \"(.*)\" (.*) (.*)$/;
        ($method, $protocol, $path, $httpVersion) =
            $request =~ m|^(\w+) (\w+)://(.*) HTTP/(.*)$|;
    }

    #
    # Date, Time (e.g. 29/Sep/1995:13:13:11)
    #
    ($day, $month, $year, $hour, $minute, $second) =
        $date =~ m|^(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)|;

    # exchange name of month with number
    $month = $monthName{$month};

    #
    # parse $path: ($domain,) $dir, $file, $param
    #
    ($path, $param) = $path =~ /([^\?]*)\??([^\?]*)$/;

    if ($netscape) {
        ($dir, $file) = $path =~ m|^(.*/)?([^/]*)$|;
    }
    else {
        ($domain, $dir, $file) = $path =~ m|^([^/]+)/(.*/)?([^/]*)$|;
    }
    $dir = '/' . $dir;

    #
    # parse $domain: $topdomain, $port
    #
    ($topdomain, $port) = $domain =~ /.*\.([^:]*)?:?(\d+)?$/;
}

#
# Filter
#
# Returns $true if the fields set by &parse pass every filter given on the
# command line, $false otherwise.  Pattern filters are run through the
# generated code in $prg_filter; the 'bytes', 'date' and 'time' ranges in
# %filter are checked here.  If several ranges are given for one type, the
# line has to lie within all of them.
#
sub filter {
    local ($min, $max);
    local (@filterRange, $range);
    local ($minDay, $minMonth, $minYear, $maxDay, $maxMonth, $maxYear);
    local ($minHour, $minMin, $minSec, $maxHour, $maxMin, $maxSec);

    # perl versions 4 and 5 seem to interpret 'eval' differently;
    # a plain
    #   eval $prg_filter;
    # only works for version 4, so the value handed back by the
    # generated 'return scalar($false)' is tested explicitly
    ((eval $prg_filter) eq '0') && return $false;

    #
    # bytes: 'min-max' (max may be omitted)
    #
    if ($filter{'bytes'}) {
        @filterRange = split(' ', $filter{'bytes'});
        foreach $range (@filterRange) {
            ($min, $max) = $range =~ /(\d*)-(\d*)/;
            ($bytes < $min) && return $false;
            ($max) && ($bytes > $max) && return $false;
        }
    }

    #
    # date: 'dd/mm/yyyy-dd/mm/yyyy'; a single date means that day only
    #
    if ($filter{'date'}) {
        @filterRange = split(' ', $filter{'date'});
        foreach $range (@filterRange) {
            ($min, $max) = $range =~ m|([\d/]+)-?([\d/]*)|;

            ($minDay, $minMonth, $minYear) =
                ($maxDay, $maxMonth, $maxYear) =
                    $min =~ m|^(\d+)/(\d+)/(\d+)$|;
            if ($max) {
                ($maxDay, $maxMonth, $maxYear) =
                    $max =~ m|(\d+)/(\d+)/(\d+)|;
            }

            (($year < $minYear) ||
             (($year == $minYear) && ($month < $minMonth)) ||
             (($year == $minYear) && ($month == $minMonth) &&
              ($day < $minDay))) && return $false;

            (($year > $maxYear) ||
             (($year == $maxYear) && ($month > $maxMonth)) ||
             (($year == $maxYear) && ($month == $maxMonth) &&
              ($day > $maxDay))) && return $false;
        }
    }

    #
    # time: 'hh:mm:ss-hh:mm:ss'; a single time means that second only
    #
    if ($filter{'time'}) {
        @filterRange = split(' ', $filter{'time'});
        foreach $range (@filterRange) {
            ($min, $max) = $range =~ m|([\d:]+)-?([\d:]*)|;

            ($minHour, $minMin, $minSec) =
                ($maxHour, $maxMin, $maxSec) =
                    $min =~ m|^(\d+):(\d+):(\d+)$|;
            if ($max) {
                ($maxHour, $maxMin, $maxSec) =
                    $max =~ m|(\d+):(\d+):(\d+)|;
            }

            # the seconds are compared against $minSec / $maxSec, the
            # variables that actually receive the parsed limits
            (($hour < $minHour) ||
             (($hour == $minHour) && ($minute < $minMin)) ||
             (($hour == $minHour) && ($minute == $minMin) &&
              ($second < $minSec))) && return $false;

            (($hour > $maxHour) ||
             (($hour == $maxHour) && ($minute > $maxMin)) ||
             (($hour == $maxHour) && ($minute == $maxMin) &&
              ($second > $maxSec))) && return $false;
        }
    }

    return $true;
}

#
# Accounting
#
# Adds the current log line to the statistics: runs the generated list
# code (which appends field values to %list) and updates
# $count{'bytes'} / $count{'requests'} if they were requested via '-count'.
#
sub account {
    local (@cntTypes);

    #
    # List
    #
    eval $prg_account{'list'};

    #
    # Count
    #
    @cntTypes = split(' ', $output{'count'});
    (grep(/^bytes$/, @cntTypes)) && ($count{'bytes'} += $bytes);
    (grep(/^requests$/, @cntTypes)) && ($count{'requests'}++);
}

#
# Sort helper for &countItems: higher count first, equal counts
# alphabetically.  Reads the %count that &countItems has localized.
#
sub byCountDesc {
    ($count{$b} <=> $count{$a}) || ($a cmp $b);
}

#
# Count items in a list
#
# Parameter: a string of blank-separated items.
# Returns:   a list of strings 'item count', one per distinct item,
#            sorted alphabetically, or by descending count if $sortNum
#            is set.
#
sub countItems {
    local ($list) = @_;
    local (@items, @resultArray);
    # shadows the global %count (used by &account) while this sub runs
    local (%count);

    # count occurrences
    foreach (split(' ', $list)) {
        $count{$_}++;
    }

    @items = sort keys %count;
    ($sortNum) && (@items = sort byCountDesc @items);

    foreach (@items) {
        push(@resultArray, $_ . ' ' . $count{$_});
    }
    return @resultArray;
}

#
# Print an array, one element per line
#
# Setting the output separators is preferred to
#   print join("\n", @array)
# since it does not build one huge string for a very long array.
#
sub printArray {
    local (@array) = @_;
    local ($,, $\) = ("\n", "\n");

    print @array;
}

#
# Output a logline
#
# Does nothing when '-list' or '-count' output was requested.  Otherwise
# prints the raw log line, or every parsed field if $verbose is set.
#
sub out {
    ($printExtended) && return;

    if ($verbose) {
        print "\nHost: $host\n";
        print "Logname: $logname\n";
        print "Authuser: $authuser\n";
        print "Date: $date\n";

        # request
        print "Method: $method\n";
        (!$netscape) && print "Protocol: $protocol\n";

        # path
        print "Domain: $domain\n";
        print "topDom: $topdomain\n";
        ($port) && print "Port: $port\n";
        print "Dir: $dir\n";
        print "File: $file\n";
        ($param) && print "Param: $param\n";

        print "HTTP-Ver: $httpVersion\n";
        print "Status: $status\n";
        print "Bytes: $bytes\n";

        if ($netscape) {
            print "Browser: $browser\n";
            print "Operating System: $os\n";
        }
    }
    else {
        print $logline;
    }
}

#
# Help
#
# Prints the usage text; returns the (true) result of 'print' so that the
# caller can chain '&& exit'.
#
sub help {
    print <<EOF;
WWW log file analyzer 1.0
Peter Kai Wimmer (peter\@leo.org)

-l logfile
-netscape    logfile is in NetScape format (default: common log)
-regexp      use regular expressions instead of shell-style expr.
-sort        sort -list output by number

Filter:
-host
-logname     wimmer
-authuser
-date dd/mm/yyyy-dd/mm/yyyy    01/10/1995-30/10/1995
-time hh:mm:ss-hh:mm:ss        06:00:00-07:30:00
-method      GET, POST, ...
-protocol    http, ftp, ...
-domain      www.ibm.com, ...
-topdomain   de, com, edu, ...
-dir
-file
-param
-status      303, 20*, ok, error
-bytes xxxxx-yyyyy
-browser

Output:
-count bytes | requests
-list hosts | lognames | authusers | methods | protocols | domains |
      topdomains | dirs | files | params | browsers
without '-count' or '-list': logline itself
-verbose     print verbosely

'\@' preceding a file name reads parameters from a file (e.g. '\@args')
EOF
}