#-----------------------------------------------------------------
# Function:  parseLogLine
# Purpose:   Split one line of an Apache web server access log
#            (common or combined log format) into its components.
# Author:    Tony Byorick
# I/O:       $record  - one line of a web access log file
#            $retHash - a reference to a hash which receives the
#                       values parsed by this subroutine.  Optional:
#                       a new hash is created when it is omitted.
# Returns:   $retHash (the reference passed in, or the new hash).
#            Keys are set only for the fields that are found:
#              host      IPv4 dotted-quad address
#              date      date/time/zone, without the [ ] brackets
#              method    http method (standard methods only)
#              url       request target, starting at its first "/"
#              protocol  e.g. HTTP/1.1
#              status    http status code
#              bytes     response size; left unset when logged as "-"
#              referer   referer, without the surrounding quotes
#              client    user agent, without the surrounding quotes
# Notes:     Fields are located left to right; each search starts
#            where the previous match ended, so a field that is
#            missing from the line is simply skipped.
#-----------------------------------------------------------------
sub parseLogLine
{
my $record  = shift;  # A single line from the access log
my $retHash = shift;  # reference to output hash

$retHash = {} unless defined $retHash;
return $retHash unless defined $record;

# One [ key, pattern ] pair per field, in the order the fields appear
# on the line.  Capture group 1 is the value stored in the hash.  The
# table is built inside the sub so that it is always initialised, no
# matter where in the file the sub is defined relative to its callers.
my @fieldPatterns = (
   # IPv4 address of the client host
   [ 'host',     qr/(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/ ],

   # [date:time zone] - stop at the FIRST "]" so a later bracket
   # (e.g. "[en]" in a user agent) cannot swallow the rest of the line
   [ 'date',     qr/\[([^\]]+)\]/ ],

   # whole-word match so "GET" inside a url such as /WIDGETS is ignored
   [ 'method',   qr/\b(GET|HEAD|POST|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH)\b/ ],

   # \S* (not \S+) so that a request for the bare root "/" is accepted
   [ 'url',      qr/(\/\S*)\s/ ],

   # the closing quote of the request, when present, is left out
   [ 'protocol', qr/(HTTP\/\S+?)\"?(?:\s|$)/ ],

   [ 'status',   qr/(\d+)(?:\s|$)/ ],

   # Must be the very next token after the status.  A "-" (no body
   # sent) is consumed but not stored, so that the search does not run
   # on into the referer / user agent looking for some other number.
   [ 'bytes',    qr/^\s*(?:(\d+)|-)(?:\s|$)/ ],

   # referer and client are the next two double-quoted strings; a
   # backslash-escaped character inside the quotes is kept as-is
   [ 'referer',  qr/\"((?:[^\"\\]|\\.)*)\"/ ],
   [ 'client',   qr/\"((?:[^\"\\]|\\.)*)\"/ ],
);

my $all = $record;  # unparsed remainder of the log entry

foreach my $fieldPattern (@fieldPatterns) {
   my ($key, $pattern) = @$fieldPattern;

   next unless $all =~ $pattern;

   # Save the capture and the end-of-match offset right away.  $+[0]
   # replaces the original use of $' (and the capture replaces $&).
   my ($value, $matchEnd) = ($1, $+[0]);

   $$retHash{$key} = $value if defined $value;  # add to return hash
   $all = substr($all, $matchEnd);              # keep only the post-match
   }

return $retHash;
}