#!/usr/local/bin/perl
#-----------------------------------------------------------------------------------
# Name:    googleSearches.pl
# Purpose: Create an html page containing google searches retrieved from
#          apache web logs.
# Desc:    The script runs on Mac or Windows PC. It downloads the
#          latest access logs to a local folder containing an archive of
#          previously downloaded logs and scans every log in the archive folder.
#          References to Google searches are written to an output file. When the
#          local archive folder is completely scanned, the output HTML file is
#          uploaded to the service provider's host. The script should be run once
#          per day. Many service providers provide the current access log plus one
#          or two rotated logs.
# Note:    Log files are opened by bare file name and the output page is
#          created by bare file name, i.e. relative to the current working
#          directory. Run the script from the local log folder ($localdir).
# Author:  Tony Byorick
# Date:    02/12/08
#------------------------------------------------------------------------------------
use strict;
use warnings;
use Net::FTP;

#--- Define global variables
use vars qw(%rec);      # holds the sections of the access log record being processed

#--- FTP setup: Change any of these values as needed for your environment
my $localfile    = "";
my $remotehost   = "ftp.yourdomain.com";   # name of machine hosting the web server instance
my $remotefolder = "logs";                 # location of log files on the remote machine
my $user         = "yourID";               # ftp login ID
my $password     = "yourPWD";              # ftp login password

# Location to store log files on local machine. Change as needed.
# Single quotes are required: inside double quotes the backslashes would be
# interpreted as escape sequences and corrupt the path.
my $localdir     = 'C:\core\web\Apache2\htdocs\logs';

#------------------------------------------------
# FTP DOWNLOAD
# Call getFile() to download latest access logs
#------------------------------------------------
my @logfiles = getFile($localfile, $remotehost, $remotefolder, $user, $password);

#------------------------------------------------
# Get a list of log files from local hard drive.
#------------------------------------------------
my @localLogFiles = getLocalLogFileNames();

#-----------------------------------------------------
# Read each line of log file content from local drive
#-----------------------------------------------------
my @sortTemp;           # output lines, each prefixed with "<sortable date>~"

foreach my $logfile (@localLogFiles) {
    open(my $logHandle, '<', $logfile) or die("Couldn't open $logfile: $!\n");

    while (my $thisLine = <$logHandle>) {
        next if $thisLine =~ /^#/;          # discard comment lines
        next if discard($thisLine);         # line contains a discard word
        chomp $thisLine;                    # strip newline (and only a newline)
        next if $thisLine eq "";
        next unless retain($thisLine);      # line contains nothing we care about

        #--- Populate the %rec hash with elements of the access log for the
        #--- current line. parseLogLine() only sets the keys it can match, so
        #--- the hash is emptied first to avoid reusing fields of an earlier line.
        %rec = ();
        parseLogLine($thisLine, \%rec);

        #--- Only Google search referers are reported
        next unless defined $rec{'referer'}
                 && $rec{'referer'} =~ /www\.google(.+)\/search\?/;

        #-------------------------------------------------------------------------
        #--- Convert apache date into a numeric string to help with sorting.
        #--- The apache date format, "28/Jan/2008:16:54:18 -0800" is converted
        #--- to this format: "20080128-165418". The numeric date is pre-pended
        #--- to the output line (separated by '~') when it is pushed onto the
        #--- sortTemp array, and removed again before the line is printed.
        #-------------------------------------------------------------------------
        my $logDate = defined $rec{'date'} ? $rec{'date'} : '';
        my $seqDate = apacheDate2Bin($logDate);

        # NOTE(review): the inline markup of this line was lost from the
        # source; only the line breaks could be recovered. Confirm the layout.
        my $outputHtmlLine = $seqDate . '~' . $logDate . '<br>'
                           . anchorWrap($rec{'referer'}, '')
                           . "<br><br>\n";
        push(@sortTemp, $outputHtmlLine);
    }

    close($logHandle);
}

#-------------------------------------------
# Open output file and populate from the
# sortTemp array.
#-------------------------------------------
my $outfileNameIphone = 'googleSearch.htm';
open(my $outHandle, '>', $outfileNameIphone)
    or die "Can't open $outfileNameIphone: $!";

#--- Print start of HTML page. Supply page header and subtitle strings, '1' means place a table around the
#--- whole page
print {$outHandle} htmlBegin("Google Searches",
    "Google Searches that Produced<br>Visits to www.yourDomain.com/icore", 1);

foreach my $sortedLine (sort @sortTemp) {
    # Limit of 2: only the first '~' separates the sort key from the text,
    # any '~' inside the referer URL must survive.
    my ($sortkey, $outText) = split(/~/, $sortedLine, 2);
    print {$outHandle} $outText;
}

print {$outHandle} htmlEnd(1);      # finish the html page
close($outHandle) or die "Can't close $outfileNameIphone: $!";

#------------------------------------------------
# FTP UPLOAD
# Call putFile() to upload the summary
#------------------------------------------------
#--- Change any of these values as needed for your environment
$remotehost   = "ftp.yourDomain.com";
$remotefolder = "icore/access";
$user         = "yourID";
$password     = "yourPWD";
$localdir     = 'C:\core\web\Apache2\htdocs\logs';     # change as needed
$localfile    = $outfileNameIphone;

putFile($localdir, $localfile, $remotehost, $remotefolder, $user, $password);

#-------------------------- END OF MAIN SCRIPT --------------------------

#--------------------------------------------------------------------
# Function: parseLogLine
# Purpose:  Read a line from an apache web server log and split
#           into components
# I/O:      $record  - one line of a web access log file
#           $retHash - Reference to a hash which returns values
#                      parsed by this subroutine to the main script
# Keys set: host, date, method, url, protocol, status, bytes,
#           referer, client. A key is only set when its field is
#           found; existing entries of the hash are left untouched.
# Process:  The fields are matched one after the other. After each
#           match the search continues behind the matched text, so
#           the order of the steps below matters.
#---------------------------------------------------------------------
sub parseLogLine {
    my $record  = shift;
    my $retHash = shift;        # reference to output hash

    my $all = $record;          # remainder of the log entry still to be parsed

    #--- hostname (IP address)
    if ( $all =~ /(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/ ) {
        $retHash->{'host'} = $1;
        $all = substr($all, $+[0]);     # $+[0] is the offset just past the match
    }

    #--- date: text between the first '[' and the next ']'
    if ( $all =~ /\[([^\]]+)\]/ ) {
        $retHash->{'date'} = $1;
        $all = substr($all, $+[0]);
    }

    #--- http method
    if ( $all =~ /(GET|POST)/ ) {
        $retHash->{'method'} = $1;
        $all = substr($all, $+[0]);
    }

    #--- requested URL; \S* (not \S+) so that a request for "/" is matched too
    if ( $all =~ /(\/\S*)\s/ ) {
        $retHash->{'url'} = $1;
        $all = substr($all, $+[0]);
    }

    #--- protocol
    if ( $all =~ /(HTTP\/\S+)\s/ ) {
        my $trimmed = $1;
        $trimmed =~ s/\"$//;            # strip trailing double quote
        $retHash->{'protocol'} = $trimmed;
        $all = substr($all, $+[0]);
    }

    #--- http status code
    if ( $all =~ /(\d+)\s/ ) {
        $retHash->{'status'} = $1;
        $all = substr($all, $+[0]);
    }

    #--- bytes transferred; a "-" is accepted so that a missing byte count
    #--- does not make the match skip ahead into the following fields
    if ( $all =~ /(\d+|-)\s/ ) {
        $retHash->{'bytes'} = $1;
        $all = substr($all, $+[0]);
    }

    #--- referer: a quoted string without blanks, or "-"
    if ( ($all =~ /\"(\w\S+)\"\s/i) || ($all =~ /\"(-)\"/) ) {
        $retHash->{'referer'} = $1;
        $all = substr($all, $+[0]);
    }

    #--- client (user agent); may be the last field of the line, so the
    #--- closing quote is followed by white space or the end of the string
    if ( $all =~ /\"(Mozilla.+)\"(?:\s|$)/i ) {
        $retHash->{'client'} = $1;
        $all = substr($all, $+[0]);
    }

    return;
}

#-----------------------------------------------------------------
# Function: getLocalLogFileNames
# Purpose:  Return a list of access.log file names by scanning a
#           folder located on the local machine.
# Input:    $dirname - optional folder to scan; defaults to
#                      '/core/web/Apache2/htdocs/logs'
# Returns:  List of file names (without folder) that start with
#           "access" and contain ".log". Each name is also printed.
#-----------------------------------------------------------------
sub getLocalLogFileNames {
    # Replace the default with the folder path holding access logs FTP'd
    # down from the remote server
    my $dirname = @_ ? shift : '/core/web/Apache2/htdocs/logs';
    my @fileList;

    opendir(my $dirHandle, $dirname) or die "cant open $dirname: $!\n";
    while ( defined(my $file = readdir $dirHandle) ) {
        next unless $file =~ /^access\w*\.log/;
        print "$file\n";
        push(@fileList, $file);
    }
    closedir($dirHandle);

    return @fileList;
}

#-----------------------------------------------------------------
# Function: getFile
# Purpose:  Use ftp to copy log files from a remote machine to
#           the local machine.
#-----------------------------------------------------------------
# Input:    $filename    - not used as input; kept so existing calls
#                          keep working
#           $from_host   - name of the ftp server
#           $from_folder - folder on the ftp server holding the logs
#           $uid         - ftp login id
#           $pwd         - ftp login password
# Returns:  List of the log file names found in the remote folder
#           listing (names matching /access\w*\.log/).
# Errors:   Dies when the connection, the login or the change of
#           directory fails. A failed download only raises a warning
#           so the remaining files are still fetched.
# Note:     get() is called without a local name; per the Net::FTP
#           documentation the file is then stored in the current
#           directory under the remote file's name.
#-----------------------------------------------------------------
sub getFile {
    my $filename    = shift;    # local filename (overwritten below)
    my $from_host   = shift;    # From Host
    my $from_folder = shift;    # From Folder
    my $uid         = shift;    # ftp login id
    my $pwd         = shift;    # ftp login password

    my $ftp = Net::FTP->new($from_host) or die "Can't connect: $@\n";

    #--- Login (the password is deliberately kept out of the error message)
    $ftp->login($uid, $pwd) or die "Can't login with $uid\n";

    #--- Change to the target directory
    $ftp->cwd("$from_folder") or die "Can't change directory\n";

    $ftp->ascii();              # set ascii transfer mode

    #--- Download every access log named in the remote directory listing
    my @localfiles;
    foreach my $longfilename ($ftp->dir()) {
        # dir() returns long-format listing lines; pull the log name out of it
        next unless $longfilename =~ /(access\w*\.log)/;
        $filename = $1;

        print "\nGetting filename = [$filename] ";
        push(@localfiles, $filename);

        # Only lines that name a log file trigger a download
        $ftp->get($filename)
            or warn "Couldn't get $filename: " . $ftp->message;
    }

    #--- Disconnect
    $ftp->quit() or warn "Couldn't quit. Oh well.\n";

    return @localfiles;
}

#-----------------------------------------------------------------
# Function: putFile
# Purpose:  Use ftp to copy a file from the local machine to
#           a remote machine.
# Process:
#   putFile($localfolder, $localfile, $remotehost, $remotefolder, $user, $password);
# Input:    $from_folder - local folder holding the file
#           $filename    - name of the file inside $from_folder
#           $to_host     - name of the ftp server
#           $to_folder   - target folder on the ftp server
#           $uid, $pwd   - ftp login id and password
# Returns:  1
# Errors:   Dies when the connection, the login or the change of
#           directory fails; a failed upload raises a warning.
#-----------------------------------------------------------------
sub putFile {
    my $from_folder = shift;    # Local Folder
    my $filename    = shift;    # local filename
    my $to_host     = shift;    # To Host
    my $to_folder   = shift;    # To Folder
    my $uid         = shift;
    my $pwd         = shift;

    print "Sending $filename to $to_folder $to_host\n ";

    #--- create ftp object
    my $ftp = Net::FTP->new($to_host) or die "Can't connect: $@\n";

    #--- Login (the password is deliberately kept out of the error message)
    $ftp->login($uid, $pwd) or die "Can't login with [$uid]\n";

    #--- Change to correct directory
    $ftp->cwd("$to_folder") or die "Can't change directory\n";

    $ftp->ascii();              # set ascii transfer mode

    my $fullpath = "$from_folder/$filename";
    $ftp->put($fullpath)
        or warn "Couldn't upload $fullpath: " . $ftp->message;

    #--- Disconnect
    $ftp->quit() or warn "Couldn't quit. Oh well.\n";

    return 1;
}

#-----------------------------------------------------------------
# Function: discard
# Purpose:  Return true if the input line contains a character
#           string indicating we dont care about the line of text.
# Input:    $lineOfText - one line of an access log
# Returns:  1 when any discard pattern matches (case-insensitive),
#           otherwise 0. The entries are regular expressions.
#-----------------------------------------------------------------
sub discard {
    my $lineOfText = shift;

    # The discard patterns
    my @discard = (
        "ocadia",
        "themes",
        '24\.18\.',
        '76\.114\.206',
        '\.css',
        '\\/js',
        '\.png',
        'favicon\.ico',
        '\/image\/background',
        'wp-admin\/images',
    );

    # Compare the input line to the list of discard strings; the first
    # hit settles the answer.
    foreach my $discardString (@discard) {
        return 1 if $lineOfText =~ /$discardString/i;
    }

    return 0;
}

#-----------------------------------------------------------------
# Function: retain
# Purpose:  Return true if the input line contains a character
#           string we care about. Lines containing these words
#           are included in processing by main script.
# Input:    $lineOfText - one line of an access log
# Returns:  1 when any retain pattern matches (case-insensitive),
#           otherwise 0. The entries are regular expressions.
#-----------------------------------------------------------------
sub retain {
    my $lineOfText = shift;

    # Single quotes throughout: in a double-quoted string "\." loses its
    # backslash and the dot would match any character.
    my @retain = (
        '\(iphone',
        'hello-world',
        'wpblog',
        'iphoneCafe\.jpg ',
    );

    # Compare the input line to the list of retain strings
    foreach my $retainString (@retain) {
        return 1 if $lineOfText =~ /$retainString/i;
    }

    return 0;
}

#---------------------------------------------------------
# Function: htmlBegin
# Purpose:  Return the start of an html page
# Input:    $titleFrame - text for the <title> element
#           $titleBody  - heading shown at the top of the page;
#                         inserted as is, so it may contain markup
#           $tableOn    - true to open a table around the page
# Returns:  The html text as one string.
# NOTE(review): the original markup of this page was lost from the
#           source; the html below is a minimal reconstruction.
#--------------------------------------------------------
sub htmlBegin {
    my $titleFrame = shift;     # Title of the browser window
    my $titleBody  = shift;     # Title shown in the page
    my $tableOn    = shift;     # tableOn=1 if create table border

    my $tableStart = "";
    if ($tableOn) {
        $tableStart = <<EOT;
<table border="1" cellpadding="5"><tr><td>
EOT
    }

    my $htmlHead = <<EOT;
<html>
<head>
<title>$titleFrame</title>
</head>
<body>
$tableStart<h2>$titleBody</h2>
EOT

    return $htmlHead;
}

#---------------------------------------------------------
# Function: htmlEnd
# Purpose:  Return the end of an html page
# Input:    $tableOn - true to close the table opened by
#                      htmlBegin()
# Returns:  The html text as one string.
# NOTE(review): reconstructed markup, see htmlBegin().
#--------------------------------------------------------
sub htmlEnd {
    my $tableOn = shift;        # =1 if create table border

    my $tableEnd = "";
    if ($tableOn) {
        $tableEnd = <<EOT;
</td></tr></table>
EOT
    }

    my $htmlFoot = <<EOT;
$tableEnd</body>
</html>
EOT

    return $htmlFoot;
}

#---------------------------------------------------------
# Function: _htmlEscape
# Purpose:  Private helper. Replace the characters & < > "
#           by their html entities so that text taken from a
#           log file cannot inject markup into the page.
#--------------------------------------------------------
sub _htmlEscape {
    my $text = shift;
    $text = '' unless defined $text;

    $text =~ s/&/&amp;/g;       # must come first
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}

#---------------------------------------------------------
# Function: anchorWrap
# Purpose:  Wrap an href anchor tag around the input URL.
#           Insert line break before query string.
# Input:    $url         - the URL (taken from the log, therefore
#                          html-escaped before it is written out)
#           $description - accepted but not used
# Returns:  '<a href="URL">DISPLAY</a>' followed by a newline.
#           DISPLAY is the URL with one <br> inserted before the
#           first "&q=" or, when there is no "&q=", before the
#           first "?q=".
#--------------------------------------------------------
sub anchorWrap {
    my $url         = shift;    # The URL
    my $description = shift;    # The description if any (unused)
    $url = '' unless defined $url;

    # Find where the search terms start; "&q=" takes precedence over "?q="
    my $breakAt = index($url, '&q=');
    $breakAt = index($url, '?q=') if $breakAt < 0;

    # Modify the URL for display. Both halves are escaped separately so the
    # inserted <br> tag stays a real tag.
    my $altUrl;
    if ($breakAt < 0) {
        $altUrl = _htmlEscape($url);
    }
    else {
        $altUrl = _htmlEscape(substr($url, 0, $breakAt))
                . '<br>'
                . _htmlEscape(substr($url, $breakAt));
    }

    my $href = _htmlEscape($url);

    return qq{<a href="$href">$altUrl</a>\n};
}

#---------------------------------------------------------
# Function: apacheDate2Bin
# Purpose:  Convert apache date to numeric string
# Input Format:  "28/Jan/2008:16:54:18 -0800"
# Output Format: "20080128-165418"
# Notes:    The timezone part is ignored. Input that does not
#           contain a date of the expected form yields
#           "00000000-000000"; an unknown month name yields "00"
#           in the month position.
#--------------------------------------------------------
sub apacheDate2Bin {
    my $inputdate = shift;      # a string value

    my $unparsable = '00000000-000000';
    return $unparsable unless defined $inputdate;

    # Split into day, month name, year and time of day
    my ($dd, $mon, $yyyy, $hh, $mm, $ss) =
        $inputdate =~ m{(\d{1,2})/([A-Za-z]+)/(\d{4}):(\d\d):(\d\d):(\d\d)};
    return $unparsable unless defined $ss;

    # Convert month from Mon to a 2 digit number
    my $bMonth = binaryMonth($mon);
    $bMonth = '00' if $bMonth eq '';

    # concatenate everything into the output string
    return sprintf('%04d%s%02d-%s%s%s', $yyyy, $bMonth, $dd, $hh, $mm, $ss);
}

#-----------------------------------------------------------------
# Function: binaryMonth
# Purpose:  Convert a month name to a two digit month number
# Input:    $month - month in Mon format; matched case-insensitively
#                    anywhere in the string, january is tested first
# Returns:  "01" .. "12", or an empty string when no month name is
#           found.
#-----------------------------------------------------------------
sub binaryMonth {
    my $month = shift;          # month in Mon format
    return '' unless defined $month;

    my @abbreviations = qw(jan feb mar apr may jun jul aug sep oct nov dec);
    foreach my $index (0 .. $#abbreviations) {
        return sprintf('%02d', $index + 1)
            if $month =~ /$abbreviations[$index]/i;
    }

    return '';
}