#!/usr/bin/env perl
use strict;
use warnings;
use encoding 'utf8';
use 5.010;
use Getopt::Long ':config', 'bundling', 'no_ignore_case';
use POSIX 'setlocale', 'LC_CTYPE';
use Socket;
use Term::ANSIColor;

my $unicode = 0;
if (setlocale(LC_CTYPE) =~ /UTF-8$/) {
	$unicode = 1;
}

# Default options
my $bytes        = 0;
my $configfile   = "$ENV{HOME}/.lighty-stats.conf";
my ($file, @files);
my $ignore       = 0;
my $items        = 10;
my $log_format   = 'lighttpd';
my $match        = 0;
my $omission_chars = ($unicode ? '…' : '...');
my $resolve      = 1;
my $show_clients = 1;
my $show_days    = 0;
my $show_hosts   = 1;
my $show_referers= 0;
my $show_sites   = 1;
my $show_status  = 0;
my $show_useragents = 0;
my $truncate_client_ipv4 = -1;
my $truncate_client_ipv6 = -1;
my $truncate_referer = -1;
my $truncate_url = -1;
my $use_formats  = 'auto';
my $verbose      = 0;
my $width        = 60;

my %formats = (
	line_even    => 'underline',
	line_uneven  => 'clear',
	title        => 'bold',
);
# Hashref structure:
# $hashref->{item} = counter (either hits or transferred bytes)
my ($clients, $days, $http_status, $hosts, $referers, $sites, $useragents);

# Arrayref structure: [show by default, ref to hashref for _show, description]
my $categories = {
	clients  => [$show_clients , \$clients    , 'Clients'         ],
	days     => [$show_days    , \$days       , 'Hits per day'    ],
	hosts    => [$show_hosts   , \$hosts      , 'vHosts'          ],
	sites    => [$show_sites   , \$sites      , 'URLs'            ],
	status   => [$show_status  , \$http_status, 'HTTP Statuscodes'],
	referers => [$show_referers, \$referers   , 'Referer strings' ],
	useragents => [$show_useragents, \$useragents, 'User agents'  ],
};


my $i;
my $filesize;
my ($log, $line);

my $exitcode = 0;

# Modify (increase) everything by 1, which means we count the hits.
# If bytes is set, this will be changed dynamically to count bytes
my $modifier = 1;

my ($byte, $client, $day, $host, $status, $time, $url, $user, $useragent, $referer);

# vsftpd is not fully supported, just for testing purposes
my $regexes = {
	lighttpd => qr/^(?<client>\S+) (?<host>\S+) (?<user>\S+) \[(?<day>..\/...\/\d{4})\:(?<time>\d{2}(?:\:\d{2}){2}) \+....\] "\S+ (?<url>\/\S*) HTTP\/1.." (?<status>\d{3}) (?<byte>\d+) "(?<referer>[^"]+)" "(?<useragent>[^"]+)\"$/,
	vsftpd   => qr/^(?<day>\S+\s+\S+\s+\d+) (?<time>\S+) \d+ \[pid \d+\] .+ Client "(?<client>\S+)", "(?<url>[^"]+)", (?<byte>\d+) bytes/,
};


my %options = (
	"b|bytes"         => \$bytes,
#	"F|config-file=s" => handled by getopt_prepare
	"c|count=i"       => \$items,
	"C|custom-regex=s"=> \$regexes->{custom},
	"f|file=s"        => \@files,
	"u|format=s"      => \%formats,
	"h|?|help"        => \&print_usage,
	"i|ignore=s"      => \$ignore,
	"l|log-format=s"  => \$log_format,
	"m|match=s"       => \$match,
	"omission-chars=s"=> \$omission_chars,
	"resolve!"        => \$resolve,
	"n"               => sub {$resolve = 0},
	"show-clients!"   => \$categories->{clients}->[0],
	"show-days!"      => \$categories->{days}->[0],
	"show-hosts!"     => \$categories->{hosts}->[0],
	"show-referers!"  => \$categories->{referers}->[0],
	"show-sites!"     => \$categories->{sites}->[0],
	"show-status!"    => \$categories->{status}->[0],
	"show-useragents!"=> \$categories->{useragents}->[0],
	"truncate-client-ipv4=i" => \$truncate_client_ipv4,
	"truncate-client-ipv6=i" => \$truncate_client_ipv6,
	"truncate-referer=i" => \$truncate_referer,
	"truncate-url=i"  => \$truncate_url,
	"U|use-formats=s" => \$use_formats,
	"v|verbose+"      => \$verbose,
	"w|width=i"       => \$width,
);

getopt_prepare();
GetOptions(%options);

if (defined($regexes->{custom})) {
	$log_format = 'custom';
	$regexes->{custom} = qr/$regexes->{custom}/;
}

unless (defined($regexes->{$log_format})) {
	print "Unsupported logfile format: $log_format\n";
	exit(100);
}

if ($truncate_client_ipv4 > 0 or $truncate_client_ipv6 > 0) {
	$resolve = 0;
}

push(@files, @ARGV);

foreach(@files) {
	if ($_ eq '-') {
		$_ = '/dev/stdin';
	}
}

unless(@files) {
	@files = ('/var/log/lighttpd/access.log');
}

if ($use_formats eq 'never' or ($use_formats eq 'auto' and not -t STDOUT)) {
	foreach(keys(%formats)) {
		$formats{$_} = '';
	}
}

foreach $file (@files) {
	unless (open($log, '<', $file)) {
		print STDERR "skipping $file: $!\n";
		$exitcode++;
		next;
	}

	if ($verbose) {
		if ($file eq '/dev/stdin') {
			$filesize = 0;
		} else {
			$filesize = (stat($file))[7];
		}
		$| = 1;
	}

	line:
	while ($line = <$log>) {
		chomp($line);

		if ($verbose and ($. % 100 == 0)) {
			if ($filesize) {
				info(1, "\rparsing $file: ".int((tell)/$filesize*100).'%');
			} else {
				info(1, "\rparsing $file: " . tell);
			}
		}

		next line if (($match and not $line =~ /$match/) or ($ignore and $line =~ /$ignore/));
		unless ($line =~ $regexes->{$log_format}) {
			info(2, "$file: skipping line $. (\"$line\")\n");
			next line;
		}

		($byte, $client, $day, $host, $status, $time, $url, $user, $useragent, $referer) =
		@+{'byte', 'client', 'day', 'host', 'status', 'time', 'url', 'user', 'useragent', 'referer'};

		if ($bytes) {
			# Sometimes $byte contains a '-' instead of the sent bytes.
			# We assume this means that nothing was sent at all.
			if ($byte eq '-') {
				$modifier = 0;
			} else {
				$modifier = $byte;
			}
		}

		if ($truncate_url > 0) {
			$url =~ s/^(\/([^\/]+\/){$truncate_url})(.*)$/$1/
			and $url .= $omission_chars;
		}
		if ($truncate_referer >= 0) {
			$referer =~ s/^(\w+:\/\/[^\/]+(\/([^\/]+\/){$truncate_referer}))(.*)$/$1/
			and $referer .= $omission_chars;
		}
		if ($truncate_client_ipv4 > 0) {
			$client = truncate_ip(4, $client, $truncate_client_ipv4);
		}
		if ($truncate_client_ipv6 > 0) {
			$client = truncate_ip(6, $client, $truncate_client_ipv6);
		}

		$clients->{$client}     += $modifier;
		$days->{$day}           += $modifier;
		$hosts->{$host}         += $modifier;
		$http_status->{$status} += $modifier;
		$referers->{$referer}   += $modifier;
		$sites->{$url}          += $modifier;
		$useragents->{$useragent}+=$modifier;
	}
	close($log);

	info(1, "\r\e[2Kparsing $file: done\n");
}

foreach (keys(%$categories)) {
	if ($categories->{$_}->[0] == 1) {
		show(${$categories->{$_}->[1]}, $categories->{$_}->[2]);
	}
}

exit($exitcode);

sub ipv6tobin {
	my $bin = '';
	foreach (split(/:/, shift)) {
		$bin .= sprintf('%016b', hex);
	}
	return($bin);
}

sub truncate_ip {
	my ($mode, $ip, $prefix) = @_;
	my ($binip, $i);
	my @length;
	my ($ipsize, $blocksize, $type, $sep);

	$ip =~ s/^::ffff://;
	if ($ip !~ /:/ and $mode == 4) {
		$blocksize = 8;
		$ipsize = 32;
		$sep = '.';
		$type = '%d';
		$binip = sprintf('%08b' x 4, split(/\./, $ip));
	} elsif ($ip =~ /:/ and $mode == 6) {
		$blocksize = 16;
		$ipsize = 128;
		$sep = ':';
		$type = '%x';
		if ($ip =~ /^(.*)::(.*)$/) {
			@length = (scalar(@{[$1 =~ /(:)/g]}), scalar(@{[$2 =~ /(:)/g]}));
			$binip = ipv6tobin($1);
			$binip .= '0' x (16*(6-$length[0]-$length[1]));
			$binip .= ipv6tobin($2);
			unless (length($binip) == $ipsize) {
				die("length($binip) is ", length($binip), ", should be $ipsize");
			}
		} else {
			$binip = ipv6tobin($ip);
		}
	} else {
		return($ip);
	}
	$binip = substr($binip, 0, $prefix);
	$binip .= '0'x($ipsize-$prefix);
	$ip = '';
	for ($i = 0; $i < $ipsize/$blocksize; $i++) {
		$ip .= $sep if($i);
		$ip .= sprintf($type, oct('0b'.substr($binip, $i*$blocksize, $blocksize)));
	}
	$ip .= "/$prefix";
	return($ip);
}

sub getopt_prepare {
	for (my $i=0; $i < @ARGV; $i++) {
		$_ = $ARGV[$i] || next;
		if (/^-(-config-file|F)/) {
			if (/^-(?:-config-file=(.*)|F(.*))$/) {
				$configfile = $1 || $2;
				splice(@ARGV, $i--, 1);
			} elsif (exists($ARGV[$i+1])) {
				$configfile = $ARGV[$i+1];
				splice(@ARGV, $i--, 2);
			}
		}
	}

	if (open(my $fh, '<', $configfile)) {
		while(<$fh>) {
			chomp;
			next if /^\s*#/;
			if (/^\s*(\S+)\s*=\s*(.+)$/) {
				unshift(@ARGV, "--$1", $2);
			} elsif (/^\s*(\S+)$/) {
				unshift(@ARGV, "--$1");
			}
		}
		close($fh);
	}
}

sub show {
	my ($item, $count, $hits);
	my $i;
	my @sorted;
	my $what = shift;
	my $title = shift;
	my $iaddr;

	print_format("\n$title\n", 'title');
	@sorted = sort { $what->{$b} <=> $what->{$a} } keys %$what;
	for ($i=0; ($items == 0 or $i < $items) and exists($sorted[$i]); $i++) {
		$item = $sorted[$i];
		$count = $what->{$item};

		if ($resolve and $what == $clients) {
			$item =~ s/^::ffff://;
			if ($iaddr = inet_aton($item)) {
				$item = gethostbyaddr($iaddr, AF_INET) || $item;
			}
		}
		if (!$resolve and $what == $clients) {
			$item =~ s/:(?:0+([^0-9a-f]))+/::$1/i;
		}

		$hits = "$count " . ($bytes ? "bytes" : "hits");
		print_format($item . ' 'x($width-length($item)-length($hits)) . $hits."\n", (($i % 2) ? 'line_even' : 'line_uneven'));
	}
}

sub print_format {
	my $line = shift;
	my $format = shift;
	if($formats{$format}) {
		print colored($line, $formats{$format});
	} else {
		print $line;
	}
}

sub info {
	my $level = shift;
	my $message = shift;

	if ($verbose >= $level) {
		print $message;
	}
}

sub print_usage {
	exec('perldoc', '-F', $0) or die('See man lighty-stats or perldoc lighty-stats');
}

__END__

=head1 NAME

lighty-stats - a light-weight lighttpd logfile analyzer

=head1 SYNOPSIS

B<lighty-stats> [ I<options> ] [ I<logfile> I<...> ]

=head1 DESCRIPTION

B<lighty-stats> is a logfile analyzer which, unlike others, displays the result
on stdout instead of writing it to HTML files.

By default, captions will be displayed bold and every second item will be
underlined for better readability.  (But only if the output is sent to a tty)

If not specified on the commandline, I<logfile> will be
F</var/log/lighttpd/access.log>.

Unlike the name suggests, lighty-stats can also analyze Apache logfiles,
and probably other ones as well as long as they use the same log format

=head1 OPTIONS

=over

=item B<--show-hosts>

Display category 'vHosts'

=item B<--show-sites>

Display category 'URLs'

=item B<--show-clients>

Display category 'Clients'

=item B<--show-status>

Display category 'HTTP Statuscodes'

=item B<--show-days>

Display category 'Hits per day'

=item B<--show-referers>

Display category 'Referer strings'

=item B<--show-useragents>

Display category 'User agents'

=back

The display of a category can be prohibited by a preceeding 'no-', like
B<--no-show-status>

=over

=item B<-u>, B<--format> I<format>=I<value>

Change the output format.
I<format> can be one of 'line_uneven', 'line_even' or 'title'.
I<value> is a string understood by L<Term::ANSIColor>(3perl).
'line_uneven' and 'line_even' are a category's lines,
'title' is the category name.

=item B<-U>, B<--use-formats>=I<when>

When to apply formats to the output.
By default, this is only done if the output is a terminal.
I<when> may be B<never>, B<auto> or B<always>.

=item B<-b>, B<--bytes>

Sort items by transferred bytes instead of hits.

=item B<-F>, B<--config-file>=I<filename>

Read additional options from I<filename>.
By default, F<~/.lighty-stats.conf> is read.
Use C<--config-file=> to ignore it.

See L</"CONFIGURATION"> for the file's syntax

=item B<-c>, B<--count>=I<integer>

Show I<integer> items per category (default: 10).
If I<integer> is B<0>, show all.

=item B<-C>, B<--custom-regex>=I<regex>

If your logfile deviates from the standard format: Use I<regex> for parsing.

I<regex> is a perl regular expression as described in perlre(1),
which must use named capture buffers of the form
(?E<lt>I<var>E<gt>I<regexp>)

lighty-stats uses the following I<var>s:
client, host, user, day, time, url, status, byte, referer, useragent.
A log line is parsed if it contains at least client, host, status, day and url.

Note: Avoid placing a named capture buffer as 'optional'
(like C<< (?:"(?<url>\S+)") >>) - it will lead to undefined behaviour.
Also, currently it's recommended that all I<var>s listed above occur in the
regexp, even if they're empty (like C<< (?<time>) >>).

=item B<-f>, B<--file>=I<logfile>

Analyze I<logfile> or (if I<logfile> is '-') stdin.
This option may be used multiple times

=item B<-i>, B<--ignore>=I<regexp>

ignore lines matching I<regexp> for statistics

=item B<-l>, B<--log-format>=I<application>

Use another I<application>'s logfile format for parsing.
Currently, only 'lighttpd' is supported, so using this option is quite pointless.

=item B<-m>, B<--match>=I<regexp>

Only use lines matching I<regexp> for statistics

=item B<-n>, B<--no-resolve>

Don't resolve client IPs.

=item B<--omission-chars>=I<string>

String to append after most truncated items

=item B<--truncate-url>=I<integer>

Cut requested paths to I<integer> directories.
Example: With --truncate-url=2, /i/like/this/path will become /i/like/...

=item B<--truncate-referer>=I<integer>

Same as --truncate-url, but starting after the hostname.
Example: With --truncate-referer=2,
http://example.org/hello/there/my/friend will become
http://example.org/hello/there/...

Note: This truncate option also accepts '0' as argument,
cutting off everything but the hostname

=item B<--truncate-client-ipv4>=I<prefix>

Truncate Client IPs to /I<prefix> subnets.
Overrides B<--resolve>

=item B<--truncate-client-ipv6>=I<prefix>

Truncate Client IPs to /I<prefix> prefixes.
Overrides B<--resolve>

=item B<-v>, B<--verbose>

Print some status information while analyzing the file.
Specify multiple times for more status information

=item B<-w>, B<--width>=I<columns>

Limit lines to I<columns> length

=back

=head1 CONFIGURATION

The commandline options listed above may be written to a configuration file.
Only long options without the trailing '--' are allowed, an option must have
a '=' before it's value, and lines starting with a '#' will be ignored.
For example,

  verbose
  # this is a comment, while the next line ain't
  match = #purrl
  ignore = some user agent

will be like the commandline invocation
C<lighty-stats --verbose --match='#purrl' --ignore='some user agent'>

=head1 CAVEATS

B<--match> and B<--ignore> always apply to a whole logfile line. There is no
possibility to only match/ignore a user-agent, request-URI or similar.

=head1 EXAMPLES

=over

=item * C<lighty-stats --width=$COLUMNS>

Use the whole window for displaying, so longer items are possible

=item * C<lighty-stats --ignore='127(\.\d+){3}' --pathlength=2>

Ignore local requests, trim paths to two levels

=item * C<lighty-stats -vFnever>

verbose mode, unformatted output

=item * C<zcat /var/log/lighttpd/access.log.*.gz | lighty-stats - /var/log/lighttpd/access.log>

Analyze all available logfiles

=back

=head1 AUTHORS

Daniel Friesel E<lt>derf@derf.homelinux.orgE<gt>

Some contributions by Vsevolod Kozlov

=head1 LICENSE

See COPYING
