#!/opt/bin/perl

use strict;
use warnings;

use Getopt::Long;

# Command-line driver.
#
# Recognised options:
#   --output_dir=DIR  --output_file=NAME  --report_title=TEXT  --config_file=PATH
# Every remaining argument is treated as an access log to process.
my $options = {};
my ($report_title, $output_dir, $output_file, $config_file);
GetOptions(
    'output_dir=s'   => \$output_dir,
    # $output_file was consumed below but never registered here, so it could
    # not actually be set from the command line.
    'output_file=s'  => \$output_file,
    'report_title=s' => \$report_title,
    'config_file=s'  => \$config_file,
);
$options->{'reportTitle'} = $report_title if (defined $report_title);
$options->{'outputDir'}   = $output_dir   if (defined $output_dir);
$options->{'outputFile'}  = $output_file  if (defined $output_file);
$options->{'configFile'}  = $config_file  if (defined $config_file);
$options->{'accessLogs'}  = [@ARGV]       if (0 < @ARGV);

my $bc = NihongoOrg::WebStats::BrowserCounter->new( $options );
$bc->process_stats;
exit;

######################################################################################################
######################################################################################################
######################################################################################################

package NihongoOrg::WebStats::BrowserCounter;

=head1 NAME

NihongoOrg::WebStats::BrowserCounter - A package for processing web server logs and generating a user agents ("browsers") report

=head1 SYNOPSIS

    my $access_logs = [@ARGV];
    my $bc = NihongoOrg::WebStats::BrowserCounter->new( accessLogs => $access_logs );
    $bc->process_stats;

=head1 DESCRIPTION

=head1 CHANGES

 2.00 20 Aug 2005 - Initial release

=cut

#######################################################################

use strict;
use warnings;

use Carp qw (croak confess);
use File::Spec;
use IO::File;
use Symbol qw (gensym);
use Data::Dumper;

use vars qw ($VERSION);

#######################################################################

BEGIN {
    $VERSION = '2.00';
}

#######################################################################

=head1 METHODS

=cut

#######################################################################
# Object properties generic get/set accessor
#
#   $self->_property($name)          returns the stored value
#   $self->_property($name, $value)  stores $value and returns nothing
#
# Values are kept under $self->{<package name>} so they cannot collide with
# keys used by other classes sharing the same object hash.
# Dies when called with more than one value.
#
sub _property {
    my $self     = shift;
    my $property = shift;

    my $package = __PACKAGE__;
    if (0 == @_) {
        my $output = $self->{$package}->{$property};
        return $output;
    }
    elsif (1 == @_) {
        my $input = shift;
        $self->{$package}->{$property} = $input;
        return;
    }

    # "${package}" (not "{$package}") so the message carries no stray braces.
    die ("${package}::_property() - bad calling parameters\n");
}

#######################################################################

=over 4

=item new([accessLogs => [@list_of_log_files,] [configFile => $configurationFile ]);

Creates and optionally initializes the log processor. The parameters may be
passed either as a key/value list or as a single hash reference (which is
not modified).

It takes the following optional initialization parameters:

=over 4

=item accessLogs

An anonymous list of log files for processing (a single file name is
also accepted).

=item configFile

The path to the file to be used for setting the configuration of the processor

=item reportTitle

Overrides the report title set by the configuration.

=item outputDir

Overrides the output directory set by the configuration.

=item outputFile

Overrides the output file name set by the configuration.

=back

Any other parameter is a fatal error.

=back

=cut

sub new {
    my $proto   = shift;
    my $package = __PACKAGE__;
    my $class   = ref($proto) || $proto || $package;
    my $self    = bless {}, $class;

    # Always work on a private copy: recognised keys are deleted as they are
    # consumed, and that must not alter a hash owned by the caller.
    my $options = {};
    if (1 < @_) {
        %$options = @_;
    }
    elsif (defined $_[0]) {
        %$options = %{ $_[0] };
    }

    if (defined $options->{'configFile'}) {
        $self->config_file($options->{'configFile'});
        delete $options->{'configFile'};
    }

    # Defaults; the configuration loaded below may replace most of them.
    $self->probable_robots({});
    $self->unrecognized_robots({});
    $self->unrecognized_robots_count(0);
    $self->excluded_lines_count(0);
    $self->log_decompressors({});
    $self->log_parser_patterns({});
    $self->log_parser_maps({});
    $self->exclude_robots('yes');
    $self->minimum_browser_report_percentage(1); # Default 1%
    $self->raw_user_agents({});
    $self->access_logs([]);
    $self->class_map({});

    $self->load_configuration({ configFile => $self->config_file });

    # Explicit constructor parameters take precedence over the configuration.
    if (defined $options->{'accessLogs'}) {
        my $access_logs = $options->{'accessLogs'};
        my $parm_type   = ref($access_logs);
        if ($parm_type eq '') {
            $self->access_logs([$access_logs]);
        }
        elsif ($parm_type eq 'ARRAY') {
            # An empty list leaves the configured log files in place.
            if (0 < @$access_logs) {
                $self->access_logs($access_logs);
            }
        }
        else {
            croak("accessLog parameter had unsupported reference type of $parm_type");
        }
        delete $options->{'accessLogs'};
    }

    if (defined $options->{'reportTitle'}) {
        $self->report_title($options->{'reportTitle'});
        delete $options->{'reportTitle'};
    }

    if (defined $options->{'outputDir'}) {
        $self->output_dir($options->{'outputDir'});
        delete $options->{'outputDir'};
    }

    if (defined $options->{'outputFile'}) {
        $self->output_file($options->{'outputFile'});
        # This used to delete 'outputDir', leaving 'outputFile' behind to be
        # reported as an unexpected parameter below.
        delete $options->{'outputFile'};
    }

    my @remaining_parms = sort keys %$options;
    if (0 < @remaining_parms) {
        confess("Unexpected parameters in 'new' parameter list: " . Dumper ($options));
    }

    return $self;
}

#######################################################################

=over 4

=item access_logs([\@list_of_log_files]);

Get/Set accessor for the list of logfiles for processing.
Expects/Returns an anonymous list:

Ex.

    $bc->access_logs(['/var/log/httpd/access_log']);

    my $access_logs = $bc->access_logs;
    print "Log files: " . join(', ', @$access_logs) . "\n";

=back

=cut

sub access_logs { shift->_property('access_logs', @_); }

#######################################################################

=over 4

=item config_file([$configuration_file_path]);

Get/Set accessor for the path to the configuration file.
If the configuration file is set to undef or the empty string
the default configuration is used.

=back

=cut

sub config_file { shift->_property('config_file', @_); }

#######################################################################

=over 4

=item log_parse_map([$log_fields_map_hash]);

Get/Set accessor for the map of field names to positions
in the log parsing regular expression.

Ex.

    $bc->log_parse_map({
        'remote_addr' => 0,
        'ident'       => 1,
        'user'        => 2,
        'timedate'    => 3,
        'method'      => 4,
        'uri'         => 5,
        'protocal'    => 6,
        'status'      => 7,
        'bytes'       => 8,
        'referrer'    => 9,
        'useragent'   => 10,
    });

    my $parse_fields_map = $bc->log_parse_map;

=back

=cut

sub log_parse_map { shift->_property('log_parse_map', @_); }

#######################################################################

=over 4

=item log_parse_regex([$parsing_regular_expression]);

Get/Set accessor for the regular expression used to parse the log files.

Ex.
$bc->log_parse_regex(qr/^(\S+) (\S+) (\S+) (\S+) \[([^\]\[]+)\] \"(\S+)\s+(\S+)\s+(\S+)\" (\S+) (\S+) \"?([^"]*)\"? \"(.*)\"/);

my $parser_regex = $bc->log_parse_regex;

=back

=cut

sub log_parse_regex { shift->_property('log_parse_regex', @_); }

#######################################################################
# Positions of individual fields in the capture list of the log parsing
# regular expression, looked up by name in the current log_parse_map.
# Each returns undef when the map has no entry for that field.

sub _user_agent_field  { return shift->log_parse_map->{'useragent'}; }
sub _referrer_field    { return shift->log_parse_map->{'referrer'}; }
sub _remote_addr_field { return shift->log_parse_map->{'remote_addr'}; }
sub _uri_field         { return shift->log_parse_map->{'uri'}; }
sub _month_field       { return shift->log_parse_map->{'month'}; }
sub _year_field        { return shift->log_parse_map->{'year'}; }

#######################################################################
# Plain get/set accessors.  Called without an argument they return the
# stored value; called with one argument they store it.

sub class_map                         { shift->_property('class_map', @_); }
sub report_template                   { shift->_property('report_template', @_); }
sub report_title                      { shift->_property('report_title', @_); }
sub show_detailed_browser_report      { shift->_property('show_detailed_browser_report', @_); }
sub output_dir                        { shift->_property('output_dir', @_); }
sub output_file                       { shift->_property('output_file', @_); }
sub raw_user_agents                   { shift->_property('raw_user_agents', @_); }
sub monthly_raw_user_agents           { shift->_property('monthly_raw_user_agents', @_); }
sub probable_robots                   { shift->_property('probable_robots', @_); }
sub unrecognized_robots_count         { shift->_property('unrecognized_robots_count', @_); }
sub unrecognized_robots               { shift->_property('unrecognized_robots', @_); }
sub unparsable_lines_count            { shift->_property('unparsable_lines_count', @_); }
sub excluded_lines_count              { shift->_property('excluded_lines_count', @_); }
sub processed_lines_count             { shift->_property('processed_lines_count', @_); }
sub refs_count                        { shift->_property('refs_count', @_); }
sub exclude_robots                    { shift->_property('exclude_robots', @_); }
sub exclude_remote_addrs              { shift->_property('exclude_remote_addrs', @_); }
sub include_remote_addrs              { shift->_property('include_remote_addrs', @_); }
sub include_only_refs_to_uri_regex    { shift->_property('include_only_refs_to_uri_regex', @_); }
sub exclude_all_refs_to_uri_regex     { shift->_property('exclude_all_refs_to_uri_regex', @_); }
sub log_parser_patterns               { shift->_property('log_parser_patterns', @_); }
sub log_parser_maps                   { shift->_property('log_parser_maps', @_); }
sub log_decompressors                 { shift->_property('log_decompressors', @_); }
sub log_format                        { shift->_property('log_format', @_); }
sub robots_useragent_regex            { shift->_property('robots_useragent_regex', @_); }
sub robots_useragent_false_positives  { shift->_property('robots_useragent_false_positives', @_); }
sub minimum_browser_report_percentage { shift->_property('minimum_browser_report_percentage', @_); }

#######################################################################
# _hi_res_time_available;
#
# Returns availability of Time::HiRes (true if available, false if not)
#
# The module is probed (and loaded) on the first call only; the answer is
# cached on the object.
#
sub _hi_res_time_available {
    my $self = shift;

    my $hi_res_time = $self->_property('hi_res_time_available');
    unless (defined $hi_res_time) {
        eval { require Time::HiRes; };
        $hi_res_time = $@ ? 0 : 1;
        $self->_property('hi_res_time_available', $hi_res_time);
    }
    return $hi_res_time;
}

#######################################################################
# _start_time([$time]);
#
# Stores the start time for the elapsed time timer
#
sub _start_time { shift->_property('start_time', @_); }

#######################################################################
# _reset_timer;
#
# Resets the elapsed time timer to the current time.
#
# The stored start time is a [seconds, microseconds] pair when Time::HiRes
# is available and plain epoch seconds otherwise.
#
sub _reset_timer {
    my $self = shift;

    if ($self->_hi_res_time_available) {
        $self->_start_time([Time::HiRes::gettimeofday()]);
    }
    else {
        # The fallback used to call $self->_high_res_time(0), a method that
        # does not exist, so it died exactly when Time::HiRes was missing.
        # Nothing needs recording here: _hi_res_time_available has already
        # cached the negative result.
        $self->_start_time(time);
    }
    return;
}

#######################################################################
# _elapsed_time;
#
# Returns the elapsed wallclock time since the last '_reset_timer' call.
# This time is either the integer number of seconds (if Time::HiRes
# is not available) or the time accurate to the nearest millisecond
# (if Time::HiRes _is_ available).
#
# Dies (confess) when the timer has never been started.
#
sub _elapsed_time {
    my $self = shift;

    my $start_time = $self->_start_time;
    unless (defined $start_time) {
        # The method is called _reset_timer; the message used to name a
        # non-existent '_reset_time'.
        confess("_elapsed_time called without first calling _reset_timer. Bad Programmer: no biscuit.");
    }

    if ($self->_hi_res_time_available) {
        return sprintf('%0.3f', Time::HiRes::tv_interval($start_time));
    }
    return time - $start_time;
}

#######################################################################

=over 4

=item process_stats;

Executes the complete stats analysis including parsing
logs files and outputting all requested reports.

=back

=cut

sub process_stats {
    my $self = shift;

    $self->_reset_timer;
    $self->parse_logs;
    return $self->output_reports;
}

#######################################################################

=over 4

=item parse_logs;

Parses the specified log files according the the current
configuration settings.

The per-user-agent hit counts, the per-year/month breakdown, the set of
probable robots and the processed/unparsable line counters are stored on
the object for C<output_reports>. A log file that cannot be opened is
skipped with a warning.

=back

=cut

sub parse_logs {
    my $self = shift;

    my $access_logs = $self->access_logs;
    unless (defined $access_logs) {
        # caller(0) already returns the fully qualified sub name, so the
        # package must not be prepended a second time.
        my $subname = (caller(0))[3];
        croak("$subname - no logs specified for processing");
    }

    my %probable_robots           = %{$self->probable_robots};
    my $unrecognized_bots_counter = $self->unrecognized_robots_count;

    my $parse_re = $self->log_parse_regex;
    unless (defined $parse_re) {
        croak("No regular expression for parsing log was set");
    }
    my $parse_map = $self->log_parse_map;
    unless (defined $parse_map) {
        croak("No map for regular expression for parsing log was set");
    }

    # Every field below must be mapped (checked in this order), even
    # 'referrer', which the loop itself does not read.
    my %field_index = ();
    foreach my $field (qw(month year useragent referrer remote_addr uri)) {
        my $index = $parse_map->{$field};
        unless (defined $index) {
            croak("No '$field' field map for regular expression for parsing log was set");
        }
        $field_index{$field} = $index;
    }
    my $month_field       = $field_index{'month'};
    my $year_field        = $field_index{'year'};
    my $agent_field       = $field_index{'useragent'};
    my $remote_addr_field = $field_index{'remote_addr'};
    my $uri_field         = $field_index{'uri'};

    my $processed_lines  = 0;
    my $unparsable_lines = 0;

    # Month names and zero padded / plain month numbers -> plain number.
    my %month_translations = qw(
        Jan 1  Feb 2  Mar 3  Apr 4  May 5  Jun 6
        Jul 7  Aug 8  Sep 9  Oct 10 Nov 11 Dec 12
        01 1   02 2   03 3   04 4   05 5   06 6
        07 7   08 8   09 9   10 10  11 11  12 12
        1 1    2 2    3 3    4 4    5 5    6 6
        7 7    8 8    9 9
    );
    # Backslash escaped backslashes and quotes are rewritten as URL escapes
    # before the line is parsed.
    my %log_escapes = (
        "\\\\" => '%5c',
        "\\\"" => '%22',
    );

    my %raw_user_agents                = %{$self->raw_user_agents};
    my $exclude_remote_addrs           = $self->exclude_remote_addrs;
    my $include_remote_addrs           = $self->include_remote_addrs;
    my $include_only_refs_to_uri_regex = $self->include_only_refs_to_uri_regex;
    my $exclude_all_refs_to_uri_regex  = $self->exclude_all_refs_to_uri_regex;
    my $decompressors                  = $self->log_decompressors;

    # An unset pattern would interpolate as the empty pattern (warnings plus
    # "reuse the last successful match" semantics), so fall back to a
    # pattern that never matches.
    my $never_matches = qr/(?!)/;
    my $robots_useragent_regex = $self->robots_useragent_regex;
    $robots_useragent_regex = $never_matches unless (defined $robots_useragent_regex);
    my $robots_useragent_false_positives = $self->robots_useragent_false_positives;
    $robots_useragent_false_positives = $never_matches unless (defined $robots_useragent_false_positives);

    my %monthly_user_agent_breakdown = ();

    ######
    # This is where the virtually all of the time needed for log processing is spent.
    # It is _extremely_ time critical so be very careful when modifying anything in the following
    # block - it has been heavily optimized for speed.
    {
        local *AGENT_LOG_FH; # We do this vice 'gensym' or IO::File because it is _much_ faster according to testing
        local $_;            # the read loop below works on $_; do not clobber the caller's copy

        foreach my $access_log (@$access_logs) {
            # A file name without a suffix simply has no decompressor.
            my ($logfile_suffix) = $access_log =~ m/\.([-_A-Za-z0-9]+)$/;
            my $decompressor = defined($logfile_suffix)
                             ? $decompressors->{lc ($logfile_suffix)}
                             : undef;

            # Three argument open: the log file name can no longer be
            # interpreted as an open mode or as a command.
            my $opened = defined($decompressor)
                       ? open(AGENT_LOG_FH, '-|', "$decompressor $access_log")
                       : open(AGENT_LOG_FH, '<', $access_log);
            if (! $opened) {
                warn "Can't open ${access_log}. Skipped.\n $!";
                next;
            }
            binmode AGENT_LOG_FH;

            # The readline operator had been lost from this loop header; a
            # bare "while ()" never reads a line and never terminates.
            while (<AGENT_LOG_FH>) {
                $processed_lines++;
                s#(\\[\\"])#$log_escapes{$1}#gs;
                my ($remote_addr, $year, $month, $filename, $user_agent) =
                    (m/$parse_re/)[$remote_addr_field, $year_field, $month_field, $uri_field, $agent_field];
                unless (defined $user_agent) {
                    $unparsable_lines++;
                    next;
                }
                $month = $month_translations{$month};

                next if ( ($include_only_refs_to_uri_regex && ($filename !~ m/$include_only_refs_to_uri_regex/))
                       or ($exclude_all_refs_to_uri_regex && ($filename =~ m/$exclude_all_refs_to_uri_regex/))
                       or ($exclude_remote_addrs && ($remote_addr =~ m/$exclude_remote_addrs/))
                       or ($include_remote_addrs && ($remote_addr !~ m/$include_remote_addrs/))
                        );

                # Fixes proxy info bug. Fix suggested by
                # James Walter Martin III
                $user_agent =~ s#\s+# #gs;

                # Anything fetching /robots.txt is treated as a probable
                # robot unless it matches the false positives pattern.
                if ($filename eq '/robots.txt') {
                    if (not($probable_robots{$user_agent})) {
                        if ($user_agent !~ m/$robots_useragent_regex/) {
                            if ($user_agent !~ m/$robots_useragent_false_positives/) {
                                $unrecognized_bots_counter++;
                                $probable_robots{$user_agent}++;
                            }
                        }
                        else {
                            $probable_robots{$user_agent}++;
                        }
                    }
                }
                $monthly_user_agent_breakdown{$year}->{$month}->{$user_agent}++;
                $raw_user_agents{$user_agent}++;
            }

            # For a decompressor pipe, close also reaps the child and
            # reports its exit status.
            unless (close(AGENT_LOG_FH)) {
                warn "Problem closing ${access_log}: " . ($! ? $! : "exit status $?") . "\n";
            }
        }
    }

    $self->monthly_raw_user_agents(\%monthly_user_agent_breakdown);
    $self->raw_user_agents(\%raw_user_agents);
    $self->unrecognized_robots_count($unrecognized_bots_counter);
    $self->processed_lines_count($processed_lines);
    $self->probable_robots(\%probable_robots);
    $self->unparsable_lines_count($unparsable_lines);

    return;
}

#######################################################################
# output_reports;
#
# Builds the report from the counters collected by 'parse_logs' and the
# configured report template, and writes it to output_file inside
# output_dir (the directory is created when missing).
#
# Croaks when the output location is unset or unusable or the report
# cannot be written. Returns true on success.
#
sub output_reports {
    my $self = shift;

    my $processed_lines  = $self->processed_lines_count;
    my $excluded_lines   = $self->excluded_lines_count;
    my $refscounter      = $self->refs_count;
    my $unparsable_lines = $self->unparsable_lines_count;
    my $probable_robots  = $self->probable_robots;
    my $raw_user_agents  = $self->raw_user_agents;
    my $report_title     = $self->report_title;
    my $exclude_robots   = ($self->exclude_robots eq 'yes') ? 1 : 0;

    $refscounter  = 0  unless (defined $refscounter);
    $report_title = '' unless (defined $report_title);

    # See parse_logs: an unset pattern must not degrade to the empty pattern.
    my $never_matches = qr/(?!)/;
    my $robots_useragent_regex = $self->robots_useragent_regex;
    $robots_useragent_regex = $never_matches unless (defined $robots_useragent_regex);
    my $robots_useragent_false_positives = $self->robots_useragent_false_positives;
    $robots_useragent_false_positives = $never_matches unless (defined $robots_useragent_false_positives);

    my $output_dir = $self->output_dir;
    unless (defined $output_dir) {
        croak("No 'output_dir' defined");
    }
    unless (-e $output_dir) {
        mkdir $output_dir;
    }
    if (not -e $output_dir) {
        croak("output_dir $output_dir does not exist and could not be created: $!");
    }
    # '_' reuses the stat data gathered by the -e test just above.
    unless (-d _) {
        croak("output_dir $output_dir is not a directory");
    }
    unless (-w _) {
        croak("output_dir $output_dir cannot be written to (permissions error)");
    }

    my $output_file = $self->output_file;
    unless (defined $output_file) {
        croak("No 'output_file' defined");
    }
    my $index_file = File::Spec->catfile($output_dir,$output_file);

    my $minimum_browser_report_percentage = $self->minimum_browser_report_percentage;

    # Total the hits and extend the robots list with every agent matching
    # the robots pattern (and not the false positives pattern).
    my %Ruser_agents = ();
    my $robot_hits   = 0;
    while (my ($user_agent, $agent_count) = each %$raw_user_agents) {
        if ($probable_robots->{$user_agent}
            or (($user_agent =~ m/$robots_useragent_regex/)
                and ($user_agent !~ m/$robots_useragent_false_positives/))) {
            $probable_robots->{$user_agent} = $agent_count;
            $robot_hits += $agent_count;
        }
        $refscounter += $agent_count;
        $Ruser_agents{$user_agent} = $agent_count;
    }
    my $bot_hits = 0;
    foreach my $agent (keys %$probable_robots) {
        $bot_hits += ($Ruser_agents{$agent} || 0);
    }

    my %rawagents;
    my $non_bot_hits = 0;
    # The entity values had degraded to the bare characters, which turned
    # the escaping below into a no-op and let markup from user agent strings
    # into the report.
    my %html_escape = (
        '<' => '&lt;',
        '>' => '&gt;',
        '&' => '&amp;',
        '"' => '&quot;',
    );
    while(my ($user_agent,$Count) = each(%Ruser_agents)) {
        if (($user_agent eq "-") || ($user_agent eq '')) {
            # NOTE(review): "Unknown" hits are reported but never added to
            # $non_bot_hits, so they are missing from the total used when
            # robots are excluded - confirm whether that is intended.
            $user_agent = "Unknown";
        }
        else {
            my $robot_id = '';
            if ($probable_robots->{$user_agent}) {
                next if $exclude_robots;
                $robot_id = ' Possible Robot';
            }
            else {
                $non_bot_hits += $Count;
            }

            # Undo any URL encoding of user agent
            $user_agent =~ tr/+/ /;
            $user_agent =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;

            # Despoof various people
            if ($user_agent =~ m#^\S+\s+(WebTV/\S+)#o) {
                $user_agent = "$1 spoofing as $user_agent";
            }
            elsif ($user_agent =~ m#(MSIECrawler|VoilaBot|Girafabot|Netnose-Crawler)#) {
                $user_agent = "$1 spoofing as $user_agent";
            }
            elsif ($user_agent =~ m#AppleWebKit.* (Safari/\d+)#) {
                $user_agent = "$1 spoofing as $user_agent";
            }
            elsif ($user_agent =~ m# (Opera) ([0-9]+\.[0-9]+) #) {
                $user_agent = "$1/$2 spoofing as $user_agent";
            }
            elsif ($user_agent =~ m# (Firefox)/([0-9.]+)$#) {
                my ($browser, $version) = ($1,$2);
                if (($version eq '1') or ($version eq '1.0')) {
                    $version = '1.0.0'
                }
                $user_agent = "$browser/$version spoofing as $user_agent";
            }
            elsif ($user_agent =~ m# (Konqueror)[-/](\d+)#) {
                $user_agent = "$1/$2 spoofing as $user_agent";
            }
            elsif ($user_agent =~ m# (Cerberian Drtrs) Version[-/](\d+)#) {
                $user_agent = "$1/$2 spoofing as $user_agent";
            }
            else {
                # despoofs people using pseudo-'standard' of 'compatible'
                if ($user_agent =~ m#^Mozilla.*\(compatible; *([^;)]+)#oi) {
                    my $spoofer = $1;
                    $spoofer =~ s#/#-#og;
                    $spoofer =~ s/\W+$//o;
                    $user_agent="$spoofer spoofing as $user_agent";
                }
            }
            $user_agent .= $robot_id;

            # Lets not let children play with dangerous toys...
            $user_agent =~ s#([<>&"])#$html_escape{$1}#gs;
        }
        $rawagents{$user_agent}+=$Count;
    }

    # Roll the detailed agents up by brand and by brand + version.
    my (%agentgroup, %agentversion);
    foreach my $agent (keys (%rawagents)) {
        my $robot_id = '';
        if ($agent =~ m/ Possible Robot$/) {
            next if $exclude_robots;
            $robot_id = ' (Possible Robot)';
        }
        my $longagent = $agent;
        $longagent =~ s/ Possible Robot$//;

        # Everything before the first '(' or '['; an agent that starts with
        # one of those has no base at all.
        my ($base) = $longagent =~ m#^([^\(\[]+)#o;
        $base = '' unless (defined $base);
        $base =~ s#\s+$##o;
        $base =~ s#via proxy.*$##ogi;

        my ($name,$version) = $base =~ m#^([^\d\/]+)[\s\/vV]+(\d[\.\d]+)#o;
        ($name) = $base =~ m#^([^\d\/]+)#o if (! $name);
        $name    = 'Failed to parse' unless defined ($name);
        $version = '0' unless (defined $version);

        $agentgroup{"$name$robot_id"}            += $rawagents{$agent};
        $agentversion{"$name $version$robot_id"} += $rawagents{$agent};
    }

    my $major_version;
    foreach my $key (keys(%agentversion)) {
        my $mversion = $key;
        $mversion =~ s/(\d+)\.\d+([^\.].*$|$)/$1/og;
        $major_version->{$mversion} += $agentversion{$key};
    }

    $refscounter    -= $bot_hits if $exclude_robots;
    $excluded_lines += $processed_lines - $refscounter;

    my $report_template = $self->report_template;
    $report_template = '' unless (defined $report_template);
    my $min_matches_percent_formatted = sprintf("%0.2f",$minimum_browser_report_percentage);
    my $date= localtime(time);

    $refscounter = $non_bot_hits if ($self->exclude_robots eq 'yes');

    my $class_map = $self->class_map;

    ##########################
    # Brand Summary
    $report_template = $self->ranked_hits_report({
        'hitsData'       => \%agentgroup,
        'totalHits'      => $refscounter,
        'minimumPercent' => $minimum_browser_report_percentage,
        'classMap'       => $class_map,
        'tag'            => 'browser_brand_summary',
        'template'       => $report_template,
    });

    ##########################
    # Major version summary
    $report_template = $self->ranked_hits_report({
        'hitsData'       => $major_version,
        'totalHits'      => $refscounter,
        'minimumPercent' => $minimum_browser_report_percentage,
        'classMap'       => $class_map,
        'tag'            => 'browser_major_version_summary',
        'template'       => $report_template,
    });

    ##########################
    # Minor version summary
    $report_template = $self->ranked_hits_report({
        'hitsData'       => \%agentversion,
        'totalHits'      => $refscounter,
        'minimumPercent' => $minimum_browser_report_percentage,
        'classMap'       => $class_map,
        'tag'            => 'browser_minor_version_summary',
        'template'       => $report_template,
    });

    ##########################
    # The nearly raw dump of user agents (the detailed report)
    my $show_detailed = $self->show_detailed_browser_report;
    if (defined($show_detailed) and ($show_detailed eq 'yes')) {
        $report_template = $self->alpha_hits_report({
            'hitsData'  => \%rawagents,
            'totalHits' => $refscounter,
            'classMap'  => $class_map,
            'tag'       => 'browser_detail_summary',
            'template'  => $report_template,
        });
    }

    ##########
    # All the real work is done. Now we build the output results and save them.
    my $elapsed_time     = $self->_elapsed_time;
    my $lines_per_second = 'unspecified';
    if ($elapsed_time > 0) {
        $lines_per_second = int($processed_lines / $elapsed_time);
    }

    my %report_values = (
        'version'               => $VERSION,
        'report_date'           => $date,
        'report_title'          => $report_title,
        'processed_lines'       => $processed_lines,
        'measured_hits'         => $refscounter,
        'robot_hits'            => $robot_hits,
        'lines_per_second'      => $lines_per_second,
        'elapsed_time'          => $elapsed_time,
        'unparsable_lines'      => $unparsable_lines,
        'excluded_lines'        => $excluded_lines,
        'browser_report_cutoff' => $min_matches_percent_formatted,
    );
    # Replace every ${key} placeholder that names one of the values above.
    my @subkeys = sort keys %report_values;
    my $template_sub_string = '\$\{(' . join('|',@subkeys) . ')\}';
    $report_template =~ s/$template_sub_string/$report_values{$1}/gs;

    my $output_fh = IO::File->new($index_file, ">");
    unless ($output_fh) {
        croak ("Could not open $index_file for writing.\n$!");
    }
    # Buffered write errors may only surface at close, so check both.
    unless ((print {$output_fh} $report_template) and close ($output_fh)) {
        croak ("Could not write $index_file.\n$!");
    }

    return 1;
}

###############################################################################
# _hits_report($parms, \@ordered_keys);
#
# Shared implementation of 'ranked_hits_report' and 'alpha_hits_report':
# the two differ only in the order of the report lines, which the caller
# passes in as \@ordered_keys.  $parms is the parameter hash documented on
# those two methods.  Returns the template with the tagged block expanded.
#
sub _hits_report {
    my $self = shift;
    my ($parms, $line_keys) = @_;

    my $tag         = $parms->{'tag'};
    my $hits_data   = $parms->{'hitsData'};
    my $min_percent = $parms->{'minimumPercent'};
    my $min_hits    = $parms->{'minimumHits'};
    my $template    = $parms->{'template'};
    my $refscounter = $parms->{'totalHits'};
    my $class_map   = $parms->{'classMap'};

    unless (defined $refscounter) {
        $refscounter = 0;
        foreach my $key (@$line_keys) {
            $refscounter += $hits_data->{$key};
        }
    }

    my $ranking = 0;
    my $data    = {};
    foreach my $key (@$line_keys) {
        # The rank counts every line, including the ones filtered out below.
        $ranking++;
        my $hits = $hits_data->{$key};
        # A total of zero would otherwise be a fatal division by zero.
        my $percentage = sprintf('%0.2f', ($refscounter ? (100 * $hits / $refscounter) : 0));
        next if ((defined($min_percent) and ($percentage < $min_percent))
              or (defined($min_hits) and ($hits < $min_hits))
                );
        my $class = defined($class_map->{$key}) ? $class_map->{$key} : 'unspecified';
        $data->{$ranking} = {
            'hits'       => $hits,
            'percentage' => $percentage,
            'name'       => $key,
            'ranking'    => $ranking,
            'class'      => $class,
        };
    }

    # The closing tag had degraded to the empty string, which made every
    # "block" consist of the opening tag alone.
    return $self->make_tagged_block_substitutions({
        'data'     => $data,
        'template' => $template,
        'startTag' => "<$tag>",
        'endTag'   => "</$tag>",
    });
}

###############################################################################
# ranked_hits_report({ tag            => 'example',
#                      hitsData       => \%hit_count_data,
#                      template       => $report_template,
#                    [ totalHits      => $total_number_of_hits, ] (optional, will be computed from hitsData if not given)
#                    [ minimumPercent => $min_percent_in_report, ] (optional)
#                    [ minimumHits    => $min_hits_in_report, ] (optional)
#                    [ classMap       => $class_keys_map,] (optional)
#                   );
#
# Expands the <example>...</example> block of the template once per entry
# of hitsData, ordered by descending hit count.  Returns the new template.
#
sub ranked_hits_report {
    my $self = shift;
    my ($parms) = @_;

    my $hits_data = $parms->{'hitsData'};
    my @line_keys = sort { $hits_data->{$b} <=> $hits_data->{$a} } keys %$hits_data;
    return $self->_hits_report($parms, \@line_keys);
}

###############################################################################
# alpha_hits_report({ tag            => 'example',
#                     hitsData       => \%hit_count_data,
#                     template       => $report_template,
#                   [ totalHits      => $total_number_of_hits, ] (optional, will be computed from hitsData if not given)
#                   [ minimumPercent => $min_percent_in_report, ] (optional)
#                   [ minimumHits    => $min_hits_in_report, ] (optional)
#                   [ classMap       => $class_keys_map,] (optional)
#                  );
#
# Same as 'ranked_hits_report', but the entries are ordered alphabetically
# by name.  Returns the new template.
#
sub alpha_hits_report {
    my $self = shift;
    my ($parms) = @_;

    my @line_keys = sort keys %{ $parms->{'hitsData'} };
    return $self->_hits_report($parms, \@line_keys);
}

###############################################################################
# make_tagged_block_substitutions ({ 'template' => $template_text,
#                                    'startTag' => '<example>',
#                                    'endTag'   => '</example>',
#                                    'data'     => {
#                                                    1 => { ...data hash item1... },
#                                                    2 => { ...data hash item2... },
#                                                    ....
#                                                  }
#                                  });
#
# Replaces every startTag...endTag block of the template by one copy of the
# block content (tags removed) per data record, in numeric key order, with
# each ${item} placeholder replaced by that record's value.  A block with
# no data records is removed.  Returns the new template.
#
# The tags are interpolated into regular expressions as given, so they must
# not contain unintended regular expression metacharacters.
#
sub make_tagged_block_substitutions {
    my $self = shift;
    my ($parms) = @_;

    my $start_tag = $parms->{'startTag'};
    my $end_tag   = $parms->{'endTag'};
    my $data      = $parms->{'data'};
    my $template  = $parms->{'template'};

    my @tagged_blocks = $template =~ m#($start_tag.*?$end_tag)#gs;
    # Without this guard the final substitution would be built from an
    # empty alternation that matches at every position.
    return $template unless (@tagged_blocks);

    # Collect the placeholder names used by any record.
    my %all_targets    = ();
    my @data_keys_list = sort { $a <=> $b } keys %$data;
    foreach my $data_key (@data_keys_list) {
        foreach my $item_key (keys %{ $data->{$data_key} }) {
            $all_targets{quotemeta($item_key)} = 1;
        }
    }

    my $block_string    = "^$start_tag(.*?)$end_tag";
    my $item_sub_string = '\$\{(' . join('|',sort keys %all_targets) . ')\}';
    my $item_sub_regex  = qr/$item_sub_string/;

    my %subs          = ();
    my %block_targets = ();
    foreach my $block (@tagged_blocks) {
        my ($unwrapped_content) = $block =~ m/$block_string/s;
        my @sub_data = ();
        foreach my $key (@data_keys_list) {
            my $sub_hash = $data->{$key};
            my $content  = $unwrapped_content;
            # A record lacking one of the placeholders yields an empty string.
            $content =~ s/$item_sub_regex/defined($sub_hash->{$1}) ? $sub_hash->{$1} : ''/ges;
            push (@sub_data,$content);
        }
        $subs{$block} = join('', @sub_data);
        $block_targets{quotemeta($block)} = 1;
    }

    my $block_targets_regex = '(' . join('|', keys %block_targets ) . ')';
    $template =~ s/$block_targets_regex/$subs{$1}/gs;

    return $template;
}

##

## NOTE: what follows is the remainder of a disabled legacy report ("Summary
## by fine detail of version"). Its HTML markup appears to have been lost, so
## only the surviving fragments are kept here, commented out, for reference.
##
## Summary by fine detail of version:
##
## EOF
## foreach my $key (sort {$baseagent{$b} <=> $baseagent{$a}} keys(%baseagent)) {
##     my $percentage=100*$baseagent{$key}/$refscounter;
##     next unless ($percentage >= $minimum_browser_report_percentage);
##     $percentage = 0.00 if ($percentage < 0.001);
##     $percentage=~s/(....).*/$1/o;
##     print $output_fh "\n \n \n \n\n";
## }
##
## print $output_fh "
## Hits Percent Browser
## $baseagent{$key}";
## print $output_fh "${percentage}\%$key
## \n";

###############################################################################
# load_configuration({ configFile => $config_file });
#
# Reads the configuration and applies it to the object through its
# accessors.  Returns nothing.
#
# If no 'configFile' is passed (undef or the empty string), it uses the
# __DATA__ section for configuration data.
#
# Croaks with the collected messages when the configuration has syntax
# errors, lacks the report template terminator, or does not select a
# log_format for which both a parsing regex and a field list are defined.
#
sub load_configuration {
    my $self = shift;
    my ($parms) = @_;

    my $config_file = $parms->{'configFile'};
    # The documentation of config_file promises that the empty string selects
    # the defaults as well; it used to be rejected as a missing file.
    my $use_file = (defined($config_file) and ($config_file ne '')) ? 1 : 0;

    my $fh;
    if ($use_file) {
        unless (-e $config_file) {
            croak("$config_file either does not exist or cannot be accessed\n");
        }
        unless (-r _) {
            croak("$config_file cannot be read (check file permissions)\n");
        }
        # Tested before -f, otherwise the more specific message is unreachable.
        if (-d _) {
            croak("$config_file is a directory (was expecting a file)\n");
        }
        unless (-f _) {
            croak("$config_file does not appear to be a regular file\n");
        }
        $fh = IO::File->new($config_file, O_RDONLY);
        unless ($fh) {
            die("Unable to open $config_file: $!\n");
        }
    }
    else {
        # NOTE(review): DATA is never rewound, so a second object built in
        # the same process presumably finds it already at end of file -
        # confirm before constructing more than one default-configured object.
        $fh = \*DATA;
    }

    my $linecount = 0;
    my $errors    = '';

    my $log_decompressors   = $self->log_decompressors;
    my $log_parser_patterns = $self->log_parser_patterns;
    my $log_parser_maps     = $self->log_parser_maps;
    my $access_logs         = $self->access_logs;
    my $class_map           = {};
    $self->class_map($class_map);

    # A lexical line variable is used so the caller's $_ is left alone.
    while (defined(my $line = <$fh>)) {
        chomp $line;
        $linecount++;

        # Skip comments and blank lines
        next if (($line =~ m/^\s*#/) or ($line =~ m/^\s*$/));

        if ($line =~ m/^\s*decompress\.(\S+)\s*=\s*(.*?)\s*$/) {
            # decompress.<suffix> = <command writing the log to stdout>
            my $suffix  = lc($1);
            my $program = $2;
            $log_decompressors->{$suffix} = $program;
        }
        elsif ($line =~ m/^\s*report_title\s*=\s*(.*?)\s*$/) {
            $self->report_title($1);
        }
        elsif ($line =~ m/^\s*class_map\.([A-Za-z][A-Za-z0-9]*)\s*=\s*(.*?)\s*$/) {
            # class_map.<class> = <report line name>
            my $map_class = $1;
            my $map_key   = $2;
            $class_map->{$map_key} = $map_class;
        }
        elsif ($line =~ m/^\s*access_log\s*=\s*(.*?)\s*$/) {
            push (@$access_logs, $1);
        }
        elsif ($line =~ m/^\s*output_dir\s*=\s*(.*?)\s*$/) {
            $self->output_dir($1);
        }
        elsif ($line =~ m/^\s*output_file\s*=\s*(.*?)\s*$/) {
            $self->output_file($1);
        }
        elsif ($line =~ m/^\s*exclude_all_refs_to_uri_regex\s*=\s*(.*?)\s*$/) {
            my $exclude_all_refs_to_uri_regex = $1;
            if ($exclude_all_refs_to_uri_regex ne '') {
                $self->exclude_all_refs_to_uri_regex(qr/$exclude_all_refs_to_uri_regex/);
            }
        }
        elsif ($line =~ m/^\s*include_only_refs_to_uri_regex\s*=\s*(.*?)\s*$/) {
            my $include_only_refs_to_uri_regex = $1;
            if ($include_only_refs_to_uri_regex ne '') {
                $self->include_only_refs_to_uri_regex(qr/$include_only_refs_to_uri_regex/);
            }
        }
        elsif ($line =~ m/^\s*exclude_remote_addrs\s*=\s*(.*?)\s*$/) {
            my $exclude_remote_addrs = $1;
            if ($exclude_remote_addrs ne '') {
                $self->exclude_remote_addrs(qr/$exclude_remote_addrs/);
            }
        }
        elsif ($line =~ m/^\s*include_remote_addrs\s*=\s*(.*?)\s*$/) {
            my $include_remote_addrs = $1;
            if ($include_remote_addrs ne '') {
                $self->include_remote_addrs(qr/$include_remote_addrs/);
            }
        }
        elsif ($line =~ m/^\s*exclude_robots\s*=\s*(yes|no)\s*$/i) {
            $self->exclude_robots(lc($1));
        }
        elsif ($line =~ m/^\s*minimum_browser_report_percentage\s*=\s*(.*?)\s*$/) {
            $self->minimum_browser_report_percentage($1);
        }
        elsif ($line =~ m/^\s*show_detailed_browser_report\s*=\s*(yes|no)\s*$/i) {
            $self->show_detailed_browser_report(lc($1));
        }
        elsif ($line =~ m/^\s*robots_useragent_regex\s*=\s*(.*?)\s*$/) {
            my $robots_useragent_regex = $1;
            $self->robots_useragent_regex(qr/$robots_useragent_regex/i);
        }
        elsif ($line =~ m/^\s*robots_useragent_false_positives_regex\s*=\s*(.*?)\s*$/) {
            my $robots_useragent_false_positives = $1;
            $self->robots_useragent_false_positives(qr/$robots_useragent_false_positives/i);
        }
        elsif ($line =~ m/^\s*log_format\s*=\s*([-_a-zA-Z0-9]+)\s*$/) {
            $self->log_format(lc($1));
        }
        elsif ($line =~ m/^\s*log_parsing_regex\.([-_a-zA-Z0-9]+)\s*=\s*(.*?)\s*$/) {
            my $pattern_name  = lc($1);
            my $pattern_value = $2;
            $log_parser_patterns->{$pattern_name} = qr/$pattern_value/;
        }
        elsif ($line =~ m/^\s*log_parsing_fields\.([-_a-zA-Z0-9]+)\s*=\s*(.*?)\s*$/) {
            # Whitespace separated field names, in capture order.
            my $pattern_name  = lc($1);
            my $pattern_value = $2;
            my @field_names   = split(/\s+/,$pattern_value);
            my $fields_index  = {};
            foreach my $index (0 .. $#field_names) {
                $fields_index->{$field_names[$index]} = $index;
            }
            $log_parser_maps->{$pattern_name} = $fields_index;
        }
        elsif ($line =~ m/^\s*__START REPORT TEMPLATE__\s*$/) {
            # Everything up to the end marker is the report template, kept
            # verbatim (line endings included).
            my @template_lines = ();
            my $found_end      = 0;
            while (defined(my $template_line = <$fh>)) {
                # Counted too, so later error messages cite the right line.
                $linecount++;
                if ($template_line =~ m/\s*__END REPORT TEMPLATE__\s*$/) {
                    $found_end = 1;
                    last;
                }
                push (@template_lines, $template_line);
            }
            if ($found_end) {
                $self->report_template(join('',@template_lines));
            }
            else {
                $errors .= "No __END REPORT TEMPLATE__ found in configuration\n";
            }
        }
        else {
            $errors .= "Syntax error in configuration at line $linecount: $line\n";
        }
    }

    # Only a handle opened here is closed; DATA is left as it is.
    $fh->close if ($use_file);

    my $log_format = $self->log_format;
    if (not defined $log_format) {
        # Checked first: the two lookups below need a defined key.
        $errors .="No 'log_format' was specified in configuration\n";
    }
    else {
        if (not defined $log_parser_maps->{$log_format}) {
            $errors .="No 'log_parsing_fields.$log_format' found in configuration\n";
        }
        if (not defined $log_parser_patterns->{$log_format}) {
            $errors .="No 'log_parsing_regex.$log_format' found in configuration\n";
        }
    }
    if ($errors ne '') {
        croak ($errors);
    }

    $self->log_parse_regex($log_parser_patterns->{$log_format});
    $self->log_parse_map($log_parser_maps->{$log_format});

    return;
}

#######################################################################

=head1 CONFIGURATION FILE

=head1 BUGS

None known.

=head1 TODO

Add more regression tests.

=head1 AUTHOR

Benjamin Franz

=head1 VERSION

Version 2.00 15 Aug 2005

=head1 COPYRIGHT

Copyright (c) Benjamin Franz. All rights reserved.

=head1 LICENSE

This program is free software; you can redistribute it and/or modify it
under the same terms and conditions as Perl itself.
This means that you can, at your option, redistribute it and/or modify it under either the terms of the GNU General Public License (GPL) version 1 or later, or under the Perl Artistic License. See http://dev.perl.org/licenses/ =head1 DISCLAIMER THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Use of this software in any way or in any form, source or binary, is not allowed in any country which prohibits disclaimers of any implied warranties of merchantability or fitness for a particular purpose or any disclaimers of a similar nature. IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =head1 SEE ALSO http://nihongo.org/snowhare/utilities/browsercounter/ =cut ############################################################################## # From here down is the default configuration and templates used by the # analyzer. # __DATA__ ############################################################################### # report_title is the title to be used for the report # # This can be overridden by the '--report_title=Some Title' command line option. # report_title = Sample Report ############################################################################### # access_log is the log file(s) to be processed. More than one log file may # be specified (one per 'access_log' line). This field is optional, and is # overridden by any command line listed files. 
# access_log = /home/snowhare/bin/webstats/sample_log.bz2 ############################################################################### # output_dir is the path to the directory used to output the browser report output_dir = /home/snowhare/bin/webstats ############################################################################### # output_file is the file within the output_dir where the report will be placed output_file = index.html ############################################################################### # show_detailed_browser_report flags if you want the 'detail' listing of all # web browsers sorted by name. This listing can be very long. # # I'm not kidding: IT CAN BE VERY LONG. You've been warned. # show_detailed_browser_report = no ############################################################################### # 'decompress.xxx' sections declare programs for handling the decompression of # compressed log files. The programs are required to take a filename on the # command line specifying the file to be decompressed and to send the decompressed # output to STDOUT. # # The format is 'decompress.$suffix = $program_invocation' # decompress.z = gzip -cd decompress.gz = gzip -cd decompress.bz2 = bzip2 -cd ############################################################################### # 'exclude_robots' declares whether you want to exclude robots from the reports # (other than those specifically about robots). # # Allowed values are 'yes' and 'no' # exclude_robots = yes ############################################################################### # 'minimum_browser_report_percentage' excludes browsers with less than the # specified percentage from the reports. The number must be a value between # 0 and 100 (inclusive). # minimum_browser_report_percentage = 0.1 ############################################################################### # include_only_refs_to_uri_regex defines a regular expression that URIs # must match to be included. 
If omitted or blank, all URIs are accepted. # include_only_refs_to_uri_regex = (\/|\.[jps]?html?|\.txt|\.[ja]sp|\.php\d*|\.cf)$ ############################################################################### # exclude_all_refs_to_uri_regex defines a regular expression for excluding # hits on URIs matching the regex from being included. If omitted or blank, # nothing is excluded. # exclude_all_refs_to_uri_regex = ############################################################################### # 'robots_useragent_regex' specifies a regular expression used to # identify robot user agents # robots_useragent_regex = http|FSP Utilities|Powermarks|Java\/[0-9]|bot|crawl|spider|slurp|search|google|teoma|archive|htdig|scooter|Bookmark Renewal|webcollage|ichiro|grub|findlinks|libwww-perl|larbin|RedAlert\.com|User-Agent: User-Agent:|User-Agent: Mozilla|MSNPTC|Walker|cfetch|index|Wget ############################################################################### # 'robots_useragent_false_positives_regex' specifies a regular expression used to # identify user agents that are 'false positived' by the 'robots_useragent_regex' # robots_useragent_false_positives_regex = ^Mozilla\/.*MSIE ############################################################################### # 'log_format' declares the log parsing pattern to be used to analyze the # log file log_format = multihost-combined ############################################################################### # 'log_parsing_regex.$log_format' is a regular expression used to parse # the log file being analysed. # # 'log_parsing_fields.$log_format' declares the identifying field names for # fields parsed by the 'log_parsing_regex.$log_format' regular expression # # This gives the mapping of each returned item to their field names # in the same order as returned by the log parsing pattern. 
############################################################################### # Pattern for parsing a multi-host 'combined' format log file where the # first field is the webhost name and the remaining fields are a standard # 'combined' format log log_parsing_regex.multihost-combined = ^(\S+) (\S+) (\S+) (\S+) \[(\d\d?)/([A-Za-z]{3})/(\d{4}):(\d\d?):(\d\d):(\d\d) (\S+)\] \"(\S+)\s+(\S+)\s+(\S+)\" (\S+) (\S+) \"?([^"]*)\"? \"(.*)\" log_parsing_fields.multihost-combined = host remote_addr ident user day month year hour minute second timezone method uri protocal status bytes referrer useragent ############################################################################### # Pattern for parsing a standard 'combined' format log file where the last field is the User # Agent and the second to last field is the referring URL log_parsing_regex.combined = ^(\S+) (\S+) (\S+) \[(\d\d?)/([A-Za-z]{3})/(\d{4}):(\d\d?):(\d\d):(\d\d) (\S+)\] \"(\S+)\s+(\S+)\s+(\S+)\" (\S+) (\S+) \"?([^"]*)\"? \"(.*)\" log_parsing_fields.combined = remote_addr ident user day month year hour minute second timezone method uri protocal status bytes referrer useragent ############################################################################### # Pattern for parsing a reverse 'combined' format log file where the last field is the referring # URL and the second to last field is the User Agent log_parsing_regex.reverse-combined = ^(\S+) (\S+) (\S+) \[(\d\d?)/([A-Za-z]{3})/(\d{4}):(\d\d?):(\d\d):(\d\d) (\S+)\] \"(\S+)\s+(\S+)\s+(\S+)\" (\S+) (\S+) \"?([^"]*)\"? 
\"(.*)\" log_parsing_fields.reverse-combined = remote_addr ident user day month year hour minute second timezone method uri protocal status bytes useragent referrer ############################################################################### # class_map provides mapping for various keyed items to CSS classes # class_map.msie = MSIE class_map.msie = MSIE 7 class_map.msie = MSIE 7.0 class_map.msie = MSIE 7.00 class_map.msie = MSIE 7.01 class_map.msie = MSIE 6 class_map.msie = MSIE 6.0 class_map.msie = MSIE 5 class_map.msie = MSIE 5.0 class_map.msie = MSIE 5.00 class_map.msie = MSIE 5.01 class_map.msie = MSIE 5.17 class_map.msie = MSIE 5.22 class_map.msie = MSIE 5.23 class_map.msie = MSIE 5.5 class_map.msie = MSIE 4 class_map.msie = MSIE 4.0 class_map.msie = MSIE 4.01 class_map.firefox = Firefox class_map.firefox = Firefox 0 class_map.firefox = Firefox 0.10.0 class_map.firefox = Firefox 0.10.1 class_map.firefox = Firefox 1 class_map.firefox = Firefox 0.8 class_map.firefox = Firefox 0.9 class_map.firefox = Firefox 0.9.3 class_map.firefox = Firefox 1.0 class_map.firefox = Firefox 1.0.0 class_map.firefox = Firefox 1.0.1 class_map.firefox = Firefox 1.0.2 class_map.firefox = Firefox 1.0.3 class_map.firefox = Firefox 1.0.4 class_map.firefox = Firefox 1.0.5 class_map.firefox = Firefox 1.0.6 class_map.firefox = Firefox 1.0.7 class_map.firefox = Firefox 1.0.8 class_map.firefox = Firefox 1.0.9 class_map.firefox = Firefox 1.0.10 class_map.firefox = Firefox 1.0.11 class_map.firefox = Firefox 1.0.12 class_map.firefox = Firefox 1.1 class_map.firefox = Firefox 1.1.0 class_map.firefox = Firefox 1.5 class_map.firefox = Firefox 1.5.0 class_map.firefox = Firefox 1.5.1 class_map.firefox = Firefox 1.5.2 class_map.firefox = Firefox 1.5.3 class_map.firefox = Firefox 1.5.4 class_map.firefox = Firefox 1.5.5 class_map.firefox = Firefox 1.5.6 class_map.firefox = Firefox 1.5.7 class_map.firefox = Firefox 1.5.8 class_map.firefox = Firefox 1.5.9 class_map.firefox = Firefox 1.5.10 
class_map.firefox = Firefox 2 class_map.firefox = Firefox 2.0 class_map.firefox = Firefox 2.0.0 class_map.firefox = Firefox 2.0.1 class_map.firefox = Firefox 2.0.2 class_map.firefox = Firefox 2.0.3 class_map.firefox = Firefox 2.0.4 class_map.firefox = Firefox 2.0.5 class_map.firefox = Firefox 2.0.6 class_map.firefox = Firefox 2.0.7 class_map.firefox = Firefox 2.0.8 class_map.firefox = Firefox 2.0.9 class_map.opera = Opera class_map.opera = Opera 0 class_map.opera = Opera 1 class_map.opera = Opera 2 class_map.opera = Opera 3 class_map.opera = Opera 4 class_map.opera = Opera 5 class_map.opera = Opera 6 class_map.opera = Opera 7 class_map.opera = Opera 7.54 class_map.opera = Opera 8 class_map.opera = Opera 8.01 class_map.opera = Opera 9 class_map.opera = Opera 10 class_map.omniweb = OmniWeb class_map.omniweb = OmniWeb 0 class_map.omniweb = OmniWeb 1 class_map.omniweb = OmniWeb 2 class_map.omniweb = OmniWeb 3 class_map.omniweb = OmniWeb 4 class_map.omniweb = OmniWeb 5 class_map.safari = Safari class_map.safari = Safari 85 class_map.safari = Safari 125 class_map.safari = Safari 312 class_map.safari = Safari 412 class_map.konqueror = Konqueror class_map.konqueror = Konqueror 0 class_map.mozilla = Mozilla class_map.mozilla = Mozilla 0 class_map.mozilla = Mozilla 1 class_map.mozilla = Mozilla 2 class_map.mozilla = Mozilla 3 class_map.mozilla = Mozilla 3.0 class_map.mozilla = Mozilla 3.01 class_map.mozilla = Mozilla 4 class_map.mozilla = Mozilla 4.0 class_map.mozilla = Mozilla 4.01 class_map.mozilla = Mozilla 5 class_map.mozilla = Mozilla 5.0 class_map.mozilla = Mozilla 5.5 class_map.mozilla = Mozilla 6 class_map.mozilla = Mozilla 7 class_map.unknown = Unknown class_map.unknown = Unknown 0 ############################################################################### # The report template blocks contains the HTML templates used to generate the main # browser report. 
This allows nearly complete freedom in choosing what sections # will appear in the report and how they will be presented. # __START REPORT TEMPLATE__ ${report_title}

${report_title}

${report_date}
${processed_lines} lines processed in ${elapsed_time} seconds (${lines_per_second} lines per second).
${unparsable_lines} lines could not be parsed. ${robot_hits} hits looked like robots.
${excluded_lines} lines excluded, ${measured_hits} hits measured in this run.
Report cutoff at ${browser_report_cutoff}%

Overview
Hits Percent Browser
${hits} ${percentage}% ${name}
Major Version
Hits Percent Browser
${hits} ${percentage}% ${name}
Minor Version
Hits Percent Browser
${hits} ${percentage}% ${name}


__END REPORT TEMPLATE__