#!/usr/bin/perl -ws
# Created by Ben Okopnik on Thu Jun 28 09:11:52 EDT 2007
#
# Copyright (C) 2007 Ben Okopnik <ben@okopnik.com>
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

=pod
################################## Changelog ##############################

07/28/07  0:41 - v2.2
* Added 'jwhois' as the preferred option, with a warning if it's not
	installed. Caching for 'jwhois' is disabled when '-X' is in effect;
	'-H' is no longer a hard-wired argument to 'whois' ('jwhois' doesn't
	support it), but is still appended if 'whois' is used.
* Tweaked a couple of the regexen to process new TLDs (.fi, .ly, etc.)
* Giving serious thought to modifying the format of the -F files; it would
	be nice to be able to specify the whois server for individual domains.

07/20/07  9:36 - v2.1
* Added a bunch of tracing/debugging statements to the date parser, making
	the '-X' option much more useful
* Built a 'switch-case' structure around the parser so that only one regex
	would apply to any given host
* Added a '-H' argument to 'whois' ("elide legal disclaimer") to make
	debugging output less annoying (and maybe speed things up fractionally)
* Made the 'no expiration date found' error into a non-fatal warning (used
	to break list processing)
* Modified the output format slightly (warnings now appear on the same line
	as the domain name)
* Domains without a registrar will no longer be omitted from the mailed
	notifications

07/19/07 22:28 - v2.0
* Now parsing .ci domains as well (millions of people cheer, world peace
	can't be far away now...)

07/19/07 20:54 - v1.9
* Added a little regex-fu to accept lines that have whitespace at the end
* Added a Big Sekrit Option ('-X' - shhh, don't tell anybody!) for debugging

07/19/07 11:56 - v1.8
* Lots and lots of fixes for many different TLDs; much mangling of regexen.
	Now handles many more expiration date types than before. Most
	importantly, domains that don't list a registrar will now be displayed
	anyway; people probably know where to send their money, but not
	necessarily _when._

07/04/07 12:28 - v1.7
* Scrapped previous approach to the .org delay; the .orgs are now sorted to
	the end of the domain list and all except the first one wait 20 seconds.
* Added a cute little time ticker to the delay routine, just because. :)

07/03/07  1:27 - v1.6
* Added a rate limiter (3/minute) for .org domains

06/30/07 18:34 - v1.5
* Added a "domain not parseable; please report" warning
* Added an "Unable to read 'whois' info" warning for the 'fgets: connection
	reset by peer' error.
* All expiration warnings are now sent as one email instead of one per
	domain; ditto the expired domains notifications.
* The 'printf' for the 'SKIPPED' error was ignoring the '-q' option; fixed

06/30/07  8:19 - v1.4
* Removed dependency on File::Find; searching PATH 'manually'
* Added an 'exit 1' to the silent failure mode of 'croak'

06/30/07  7:06 - v1.3
* Improved the date-parsing regexes (the numerical months part can now only
	match '01-12' instead of 'any two digits'); this should increase the
	reliability of resolving 'dd-mm-yyyy' vs. 'mm-dd-yyyy' somewhat.
* More accurate reporting for the 'SKIPPED' error (now shows exact reason)
* Fixed the regexes that I screwed up while adding the Dotster extension
* Added a '-v' option

06/29/07 18:54 - v1.2
* Got rid of an unnecessary system dependency ('which') - 'File::Find' is a
	bit clunky, but better than depending on unknowns...
* Another date-processing regex (ISOC-IL: 'validity: 29-06-2007')

06/29/07 17:07 - v1.1
* Modified output format to include both exp. date and days remaining
* Added another date-processing regex (DOTSTER: 'Expires on: 29-Jun-07')

06/29/07 15:06 - v1.0
I'm finally willing to admit that this script is usable. :) Recent changes
include:

* Parsing routine for "2007/08/12" date format
* 'croak' notifies admin of problems encountered in silent mode
* Added a fallback email address for 'croak'
* Fixed GMT parsing routine miscalc (thanks to Rick Moen for the heads up)
 
###########################################################################
=cut

use strict;
use Time::Local;
$|++;

# Command-line variables
our ($d, $e, $F, $h, $q, $s, $v, $x, $X);

### FALLBACK ADDRESS FOR NOTIFICATION ############
my $address = 'root@localhost';
##################################################

my ($name) = $0 =~ /([^\/]+)$/;

my $usage =<<"+EoT+";
Usage: $name [-e=email] [-x=expir_days] [-q] [-h] <-d=domain_name|-F=domainfile>

  -d=domain        : Domain to analyze
  -e=email_address : Send a warning message by email
  -F=domain_list   : File with a list of domains, one per line
  -h               : Print this message
  -q               : Don't print to the console (REQUIRES '-e' OPTION)
  -s=whois server  : Use alternate whois server
  -v               : Display current version of this script
  -x=days          : Change default (30d) expiration interval (REQUIRES '-e' OPTION)

+EoT+

# Locate 'whois' or (preferred) 'jwhois'
my ($whois) = grep -e, map "$_/jwhois", split /:/, $ENV{PATH};
($whois) = grep -e, map "$_/whois", split /:/, $ENV{PATH} unless $whois;
die "'whois' not found in path.\n" unless $whois;
if ($whois =~ m#/whois$#){
	$q || print "You really should install 'jwhois'; it gives better results.\n";
	# Turn down the noise (minimal output option - only works with 'whois')
	$whois .= " -H";
}
else {
	# Turn off caching for 'jwhois' if the debug option is on
	$whois .= " -f" if $X;
}

# Find a mail client (mutt or mailx)
my ($mail) = grep -e, map "$_/mutt", split /:/, $ENV{PATH};
# Switch Mutt into 'mailx' mode if found
if ($mail){
	$mail .= " -x";
}
else {
	($mail) = grep -e, map "$_/mailx", split /:/, $ENV{PATH};
}
die "No 'mailx' or 'mutt' (mail client) found in path.\n" unless $mail;

# Read the version number at the top of the changelog
if ($v){
	seek DATA, 0, 0;
	while (<DATA>){
		if (m[^\d+/\d+/\d+[^v]+v([0-9.]+)]){
			print "Version: $1\nCopyright (C) 2007 Ben Okopnik <ben\@okopnik.com>\n\n";
			exit 0;
		}
	}
}

# Email admin if '-q' is on; otherwise, just exit with the error
sub croak {
	if ($q){
		# If '-e' wasn't specified, use the fallback address
		$e ||= $address;

		# No place to send an error if this fails... :)
		open Mail, "|$mail -s 'WARNING: $name script error' $e";
		print Mail "$name [" . localtime() . "]: ", $_[0];
		close Mail;

		exit 1;
	}
	else {
		die $_[0];
	}
}

# Display the help output if requested or in case of incorrect usage
die "$usage\n" if $h;
die "\n*ERROR: '$name' requires an email address with the '-q' and the '-x' options*\n\n$usage" if ($q || $x) && ! $e;
die "\n*ERROR: '$name' requires either a domain name or a domain list as an argument*\n\n$usage" if ! $d && ! $F;

# Set default notification interval to 30 days
if ($x){
	croak "Expiration interval must be specified in days (0-9999).\n"
		unless $x =~ /^\d{1,4}$/;
}
else {
	$x = 30;
}

# Add the server to the "whois" command if it's been specified
$whois .= " -h $s" if $s;

# Read the domain list file
my @domains;
if ($F){
	croak "$F is not a regular file\n" unless -f $F;
	croak "Can't read $F\n" unless -r _;
	# Open the file if it exists
	open F or croak "$F: $!\n";
	while (<F>){
		# Skip blank lines; ignore comments
		next if /^\s*(?:#|$)/;
		# Strip preceding and following blanks
		s/^\s*(.*?)\s*$/$1/;
		# Strip URI method and any terminal '/'s
		s#^.*://##;
		s#/$##;
		push @domains, $_;
	}
	close F;
}

# Having a '-F' AND a '-d' is explicitly not excluded
if ($d){
	# Strip URI method and any terminal '/'s
	$d =~ s#^.*://##;
	$d =~ s#/$##;
	push @domains, $d;
}

# Sort list to push .orgs to the end; ASCIIbetical sort otherwise
@domains = sort { ($a =~ /\.org$/i) <=> ($b =~ /\.org$/i) || $a cmp $b } @domains;

# Trim strings to specified length; return '**UNKNOWN**' if undef
sub trim {
	defined $_[0] || return "**UNKNOWN**";
	substr($_[0], 0, $_[1]);
}

# Lookup list for month number->name conversion
my (%mth,%mlookup);
@mth{map sprintf("%02d", $_), 1..12} = qw/jan feb mar apr may jun jul aug sep oct nov dec/;
# Lookup list for month name->abbrev conversion
@mlookup{qw/January February March April May June July August September October November December/} =
	qw/jan feb mar apr may jun jul aug sep oct nov dec/;

########################## DATA COLLECTION SECTION #############################

# Process the domain list
my ($seen, %list);
for my $host (@domains){

	$q || print "\b\nProcessing $host... ";

	# Delay to avoid triggering rate limiter
	if ($host =~ /\.org$/i){
		$q || print "(NOTE: Subsequent ORG queries will be delayed by 20 seconds each due to rate limiting) "
			unless $seen;
		# Show the cute little time ticker :)
		if ($seen++){
			my @chars = split //, '|/-\\';
			for (0 .. 19){
				$q || print $chars[$_ % 4], "\b";
				sleep 1;
			}
			print " \b";
		}
	}

	# Execute the query
	my $out;
	open Who, "$whois $host|" or croak "Error executing $whois: $!\n";
	{
		# Read in the entire output of 'whois' as a single string
		local $/;
		$out = <Who>;
	}
	close Who;

	# 'fgets: connection reset by peer' - bloody annoying response!
	if (!$out || $out !~ /domain/i){
		$q || print "Unable to read 'whois' info for $host. Skipping... ";
		next;
	}

	# Freak out and run away if there's no match
	if ($out =~ /no match/i){
		$q || print "No match for $host!\n";
		next;
	}
	# Ditto for bad hostnames
	if ($out =~ /No whois server is known for this kind of object/i){
		$q || print "'whois' doesn't recognize this kind of object. ";
		next;
	}

	# Convert multi-line 'labeled block' output to 'Label: value'
	my $debug;
	if ($out =~ /registrar:\n/i){
		$out =~ s/:\n(?!\n)/: /gsm;
		$debug .= "matched on line " . (__LINE__ - 2) . ": Multi-line 'labeled block'\n";
	}

	# Date preprocessing. Desired date format is '29-jun-2007'
	# 'Fri Jun 29 15:16:00 EDT 2007'
	if ($out =~ s/(date:\s*| on:\s*)[A-Z][a-z]+\s+(...)\s+(\d+).*?(\d+)\s*$/$1$3-$2-$4/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": 'Fri Jun 29 15:16:00 EDT 2007'\n";
	}
	# '29-Jun-07'
	elsif ($out =~ s/(date:\s*| on:\s*)(\d{2})[\/ -](...)[\/ -](\d{2})\s*$/$1$2-$3-20$4/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": '29-Jun-07'\n";
	}
	# '2007-Jun-29'
	elsif ($out =~ s/[^\n]*(?:date| on|expires on\.+):\s*(\d{4})[\/-](...)[\/-](\d{2})\.?\s*$/Expiration date: $3-$2-$1/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": '2007-Jun-29'\n";
	}
	# '2007/06/29'
	elsif ($out =~ s/(expires:\s*|date\s*:\s*| on:\s*)(\d{4})(?:[\/-]|\. )(0[1-9]|1[0-2])(?:[\/-]|\. )(\d{2})\.?\s*$/$1$4-$mth{$3}-$2/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": '2007/06/29'\n";
	}
	# '29-06-2007'
	elsif ($out =~ s/(?:validity:|expir(?:y|ation) date:|expires (?:on [^:]+:?|at:))\s*(\d{2})[\/.-](0[1-9]|1[0-2])[\/.-](\d{4})\s*[0-9:.]*\s*\w*\s*$/Expiration date: $1-$mth{$2}-$3/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": '29-06-2007'\n";
	}
	# '[Expires on]     2007-06-29' (.jp, .ru)
	elsif ($out =~ s/(?:valid-date|expiration date:|paid-till:|\[expires on\]|expires on :|expired:)\s*(\d{4})[\/.-](0[1-9]|1[0-2])[\/.-](\d{2})\s*[0-9:.]*\s*\w*\s*$/Expiration date: $3-$mth{$2}-$1/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": '[Expires on]     2007-06-29' (.jp, .ru)\n";
	}
	# 'expires:     June  29 2007' (.is)
	elsif ($out =~ s/expires:\s*([A-Z][a-z]+)\s+(\d{1,2})\s+(\d{4})\s*$/"Expiration date: " . sprintf("%02d", $2) . "-$mlookup{$1}-$3"/iegsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": 'expires:     June  29 2007' (.is)\n";
	}
	# 'renewal: 29-June-2007' (.ie)
	elsif ($out =~ s/renewal:\s*(\d{1,2})[\/ -]([A-Z][a-z]+)[\/ -](\d{4})\s*$/Expiration date: $1-$mlookup{$2}-$3/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": 'renewal: 29-June-2007' (.ie)\n";
	}
	# 'expire:         20080315' (.cz, .ke)
	elsif ($out =~ s/expir[ey]:\s*(\d{4})(\d{2})(\d{2})\s*$/Expiration date: $3-$mth{$2}-$1/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": 'expire:         20080315' (.cz, .ke)\n";
	}
	# 'domain_datebilleduntil: 2007-06-29T00:00:00+12:00' (.nz)
	elsif ($out =~ s/domain_datebilleduntil:\s*(\d{4})[-\/](\d{2})[-\/](\d{2})T[0-9:.+-]+\s*$/Expiration date: $3-$mth{$2}-$1/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": 'domain_datebilleduntil: 2007-06-29T00:00:00+12:00' (.nz)\n";
	}
	# '29 Jun 2007 11:58:42 UTC' (.coop)
	elsif ($out =~ s/((?:date|expires):\s*)(\d{2})[\/ -](...)[\/ -](\d{4})\s*[0-9:.]*\s*\w*\s*$/$1$2-\L$3\E-$4/igsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": '29 Jun 2007 11:58:42 UTC' (.coop)\n";
	}
	# 'Record expires on 17/8/2100' (.hm, fi)
	elsif ($out =~ s/(?:expires(?: on|:))\s*(\d{2})[\/.-]([1-9]|0[1-9]|1[0-2])[\/.-](\d{4})\s*[0-9:.]*\s*\w*\s*$/"Expiration date: $1-".$mth{sprintf "%02d", $2} . "-$3"/iegsm){
		$debug .= "matched on line " . (__LINE__ - 2) . ": 'Record expires on 17/8/2100' (.hm)\n";
	}
	else {
		$debug = "No regexes matched.\n";
	}

	# Collect the data from each query
	for (split /\n/, $out){
		# Clip pre- and post- blanks
		s/^\s*(.*?)\s*$/$1/;
		# Squash repeated tabs and spaces
		tr/ \t//s;

		# This is where it all happens - regexes to capture registrar and expiration
		$list{$host}{Registrar} ||= $1 if /(?:authorized agency|registrar)(?:\s*|_)(?:name|id)?:\s*(.*)$/i;
		$list{$host}{Expires} ||= $1 if /(?:expires(?: on)?|expir(?:e|y|ation) date\s*|renewal(?:[- ]date)?)[:\] ]\s*(\d{2}-[a-z]{3}-\d{4})/i;
		# print "Registrar: $list{$host}{Registrar}\nExpires: $list{$host}{Expires}\n";
	}

	# Assign default message if no registrar was found
	$list{$host}{Registrar} ||= "[[[ No registrar found ]]]";
	
	$q || print "No expiration date found in 'whois' output. Please report this domain to the author!"
		unless defined $list{$host}{Expires};

	# Debug option (activated by '-X'); exits here with parsed 'whois' output
	$debug .= "Registrar: $list{$host}{Registrar}\n" if defined $list{$host}{Registrar};
	$debug .= "Expires: $list{$host}{Expires}\n" if defined $list{$host}{Expires};
	die "\n", "=" x 70, "\n$out", "=" x 70, "\n$debug", "=" x 70, "\n" if $X;
}

########################## DATA ANALYSIS SECTION #############################

# Get current time snapshot in UTC
my $now = timegm(gmtime);

# Convert dates to UTC epoch seconds; *will* fail on 19 Jan 2038. :)
my %months;
@months{qw/jan feb mar apr may jun jul aug sep oct nov dec/} = 0..11;

# Print the header if '$q' is off and there's content in %list
$q || %list && printf "\n\n%-24s%-36s%s\n%s\n", "Host", "Registrar", "Exp.date/Days left", "=" x 78;

# Process the collected data
my (%exp, %end);
for my $k (sort keys %list){
	unless (defined $list{$k}{Expires}){
		$q || printf "%-32s%s\n", trim($k, 31), "*** SKIPPED (missing exp. date) ***";
		delete $list{$k};
		next;
	}
	my @chunks = split /-/, $list{$k}{Expires};
	my $epoch = timegm(0, 0, 0, $chunks[0], $months{lc $chunks[1]}, $chunks[2] - 1900);
	my $diff = int(($epoch - $now) / 86400);
	$q || printf "%-24s%-36s%-12s/%5s\n", trim($k, 23), trim($list{$k}{Registrar}, 35),
		$list{$k}{Expires}, $diff;

	# Prepare alerts if domain is expired or the expiration date is <= $x days
	if ($e && ($diff <= $x)){
		if ($diff <= 0){
			$exp{$k} = -$diff;
		}
		else {
			$end{$k} = $diff;
		}
	}
}

# Report expired domains
if (%exp){
	open Mail, "|$mail -s '$name: Expired domains' $e" or croak "$mail: $!\n";
	print Mail "According to 'whois', the following domains have expired:\n\n";
	for my $x (sort { $exp{$a} <=> $exp{$b} } keys %exp){
		my $s = $exp{$x} == 1 ? "" : "s";
		print Mail "$x ($exp{$x} day$s ago)\n";
	}
	close Mail;
}

# Report domains that will expire within the '-x' period
if (%end){
	open Mail, "|$mail -s '$name: Domain expiration warning ($x day cutoff)' $e" or croak "$mail: $!\n";
	print Mail "According to 'whois', these domains will expire soon:\n\n";
	for my $d (sort { $end{$a} <=> $end{$b} } keys %end){
		my $s = $end{$d} == 1 ? "" : "s";
		print Mail "$d (in $end{$d} day$s)\n";
	}
	close Mail;
}

__END__

