#!/usr/bin/perl

## en2ris.pl: converts EndNote "RIS" datasets to RIS format
##
## usage: perl en2ris.pl < endnote.ris > outfile.ris
##
## Dependencies: perl 5.0.0 or later
##               RefDB::CGI
##               RefDB::Prefs
##               RefDB::Log
##               Text::Iconv
##
## markus@mhoenicka.de 2003-04-27
## $Id: en2ris.in,v 1.1.2.2 2005/04/19 19:34:25 mhoenicka Exp $

##   This program is free software; you can redistribute it and/or modify
##   it under the terms of the GNU General Public License as published by
##   the Free Software Foundation; either version 2 of the License, or
##   (at your option) any later version.
##   
##   This program is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##   GNU General Public License for more details.
   
##   You should have received a copy of the GNU General Public License
##   along with this program; if not, see <http://www.gnu.org/licenses/>

## change character encoding on the fly
use Text::Iconv;

## the common RefDB modules
use RefDB::Log;
use RefDB::Prefs;

## use this module to read command line options
use Getopt::Std;

## this one is for syslog (who'd have guessed)
use Sys::Syslog;

## receives the settings read from the config files; it is only assigned
## when the config files are actually read (i.e. no -q on the command line)
my $prefs;

## defaults for locating and reading the config files
my $confdir = "/etc/refdb";
my $read_prefs = 1;
my $next;

## -q and -y decide whether and where the config files are read, so the
## raw argument list is scanned for them here, ahead of the getopts call
foreach my $arg (@ARGV) {
    if ($next) {
        ## the previous argument was -y, so this one is the config directory
        $confdir = $arg;
        $next = 0;
        next;
    }

    $next = 1 if $arg eq "-y";
    $read_prefs = 0 if $arg eq "-q";
}

## read the global config file and the one in the user's home directory
if ($read_prefs) {
    my $home = $ENV{'HOME'};
    $prefs = RefDB::Prefs::->new("$confdir/en2risrc", "$home/en2risrc");
}

#### variables to hold config options. Each one starts out with the value
#### found in the config files, or with a built-in default if the config
#### files do not define it

## returns the config file value stored under a key, or the given fallback
## if that value is undefined
my $pref_or_default = sub {
    my ($name, $fallback) = @_;
    return defined($prefs->{$name}) ? $prefs->{$name} : $fallback;
};

## name of the output file, if any. If undef, send data to stdout
my $outfile = $prefs->{"outfile"};

## if f, overwrite; if t, append
my $append = $pref_or_default->("outappend", "f");

## encodings to use for input and output. empty string uses the locale
my $from_enc = $pref_or_default->("from_enc", "ISO-8859-1");
my $to_enc = $pref_or_default->("to_enc", "UTF-8");

## logging options
my $logfile = $pref_or_default->("logfile", "/var/log/en2ris.log");
my $loglevel = $pref_or_default->("loglevel", 6);
## 0 = stderr, 1 = syslog, 2 = file
my $logdest = $pref_or_default->("logdest", 2);

## this hash will receive the command line options
my %opts;

## the switches are:
## -e dest: log destination
## -f enc: input encoding
## -h: prints help
## -l level: log level
## -L file: log file
## -o/-O file: specifies output file for writing/appending
## -q: ignore config file
## -t enc: output encoding
## -y path: set confdir
getopts('e:f:hl:L:o:O:qt:y:', \%opts);

## loop over all command line options; they override whatever the config
## files provided. The keys are sorted so that the outcome does not depend
## on hash order: "O" sorts before "o", hence -o wins if both are given
foreach my $key (sort keys %opts) {
    my $value = $opts{$key};

    if ($key eq "e") {
        $logdest = $value;
    }
    elsif ($key eq "f") {
        $from_enc = $value;
    }
    elsif ($key eq "h") {
        ## print the usage message and quit right away
        print "en2ris.pl turns EndNote \"RIS\" output into RIS\n";
        print "Usage: [perl] en2ris.pl [-e dest] [-f enc] [-h] [-l level] [-L logfile] [(-o|-O) outfile] [-q] [-t enc] [-y path]\n Reads EndNote \"RIS\" data from stdin. Output is sent to stdout unless one of the -o/-O options is used\nOptions: -e dest     log destination (stderr|syslog|file)\n         -f enc      input encoding\n         -h          print this help and exit\n         -l loglevel set log level (0-7)\n         -L logfile  path of custom log file\n         -o outfile  send output to outfile (overwrite)\n         -O outfile  send output to outfile (append)\n         -q          ignore config file\n         -t enc     output encoding\n         -y path     set custom config file path\n";
        exit(0);
    }
    elsif ($key eq "l") {
        $loglevel = $value;
    }
    elsif ($key eq "L") {
        $logfile = $value;
    }
    elsif ($key eq "o") {
        $outfile = $value;
        ## -o means overwrite, even if the config file asked for appending
        $append = "f";
    }
    elsif ($key eq "O") {
        $outfile = $value;
        $append = "t";
    }
    elsif ($key eq "q") {
        ## do nothing, -q was used before getopts
    }
    elsif ($key eq "t") {
        $to_enc = $value;
    }
    elsif ($key eq "y") {
        ## do nothing, -y was used before getopts
    }
}

## post-process a few variables: the log destination and level are passed
## through the RefDB::Log helpers before they are handed to the logger
## (presumably to turn symbolic names into numeric codes -- see RefDB::Log)
$logdest = RefDB::Log::num_logdest($logdest);
$loglevel = RefDB::Log::num_loglevel($loglevel);

## if we're supposed to write to an output file, try to open it.
## (The original condition also tested $is_cgi == 0, but that variable is
## never assigned in this script, so the test was always true.)
if (defined($outfile) && length($outfile) > 0) {
    ## the three-argument form of open keeps the mode separate from the
    ## file name, so unusual characters in the name cannot alter the mode
    if ($append eq "t") {
        open(OUT, ">>", $outfile) or die "cannot open output file for appending: $outfile: $!\n";
    }
    else {
        open(OUT, ">", $outfile) or die "cannot open output file for overwriting: $outfile: $!\n";
    }

    ## make all print commands send output to this handle
    select OUT;
}

## set up logging
my $log = RefDB::Log::->new($logdest, $loglevel, $logfile, "en2ris.pl");

## here the code proper starts

## state carried from one input line to the next: the most recent tag seen,
## and the raw PY/Y2 values of the dataset currently being read
my $last_tag = "TY  - ";
my ($PY, $Y2) = ("", "");

## initialize character encoding conversion
my $converter = Text::Iconv->new($from_enc, $to_enc);

## counter for datasets
my $set_count = 0;

## this hash helps to convert month names to numbers
my %monthnames = qw(
    January   01
    February  02
    March     03
    April     04
    May       05
    June      06
    July      07
    August    08
    September 09
    October   10
    November  11
    December  12
);

## this hash helps to convert month abbreviations to numbers
my %monthabbrevs = qw(
    Jan 01
    Feb 02
    Mar 03
    Apr 04
    May 05
    Jun 06
    Jul 07
    Aug 08
    Sep 09
    Oct 10
    Nov 11
    Dec 12
);

## read the input line by line (stdin, or any files left in @ARGV), fix up
## the fields that differ between EndNote "RIS" and RIS, and print every
## resulting line after converting its character encoding
while (<>) {
    # remove an odd character that EndNote exports once in a while for no
    # good reason; every occurrence on the line is dropped, not just the first
    tr/\035//d;

    if ($_ =~ /^(..  - )/) {
        # remember the tag of this line
        $last_tag = $1;
    }
    elsif ($last_tag eq "KW  - ") {
        # untagged line following a keyword: emit it as a keyword of its
        # own. It has to pass through the converter like all other output
        print $converter->convert($last_tag . $_);
        $_ = "";
    }

    if ($_ =~ /^SP  - /) {
        # a page range in SP is split into a start page (SP) and an
        # end page (EP) line
        if ($_ =~ s/^SP  - (.*)-(.*)/SP  - $1/) {
            my $end_page = $2;
            if (length($end_page) > 0) {
                # the final input line may lack a newline; make sure the
                # EP field starts on a line of its own
                $_ .= "\n" unless $_ =~ /\n\z/;
                $_ .= "EP  - $end_page\n";
            }
        }
        $log->log_print("debug", "fixed SP");
    }
    elsif ($_ =~ /^EP  - /) {
        if ($_ =~ /^EP  - .*-.*/) {
            # keep only the part after the last hyphen
            $_ =~ s/^EP  - .*-(.*)/EP  - $1/;
        }
        else {
            # an EP line without a hyphen is dropped
            $_ = "";
        }
        $log->log_print("debug", "fixed EP");
    }
    elsif ($_ =~ /^PY  - /) {
        # keep the value for later, the line itself is not printed
        chomp $_;
        $PY = substr($_, 6);
        $_ = "";
        $log->log_print("debug", "found PY");
    }
    elsif ($_ =~ /^Y2  - /) {
        # keep the value for later, the line itself is not printed
        chomp $_;
        $Y2 = substr($_, 6);
        $_ = "";
        $log->log_print("debug", "found Y2");
    }
    elsif ($_ =~ /^ER  - /) {
        # dump pubyear string, reset variables for new round
        my $datestring = fix_dates($PY, $Y2);
        print $converter->convert("PY  - $datestring\n");
        $PY = "";
        $Y2 = "";
        $set_count++;
    }
    elsif ($_ =~ /^ID  - /) {
        # informational message about the current dataset
        $log->log_print("info", substr($_, 6));
    }

    # whatever is left of the line (possibly nothing) goes to the output
    print $converter->convert("$_");
}

$log->log_print("info", "converted $set_count datasets");

## done processing all input
$log->close();

## the end
exit 0;

## fix_dates($PY, $Y2): assembles the contents of a RIS PY field.
##
## This function assumes that the publication year is in the PY field,
## whereas month, day, and otherinfo are in the Y2 field. Two formats
## of the Y2 field are recognized: /month/day/otherinfo, where month
## may either be numeric or a month name/abbrev; or a date like "March 10".
## Month names are looked up in %monthnames and %monthabbrevs
## (case-sensitive).
##
## Arguments: $PY - the publication year; if empty, a warning is logged
##                  through $log and "0000" is used
##            $Y2 - the raw Y2 string
## Returns:   the string "year/month/day/otherinfo"; month and day are
##            two-digit numbers, parts that could not be determined are
##            left empty
sub fix_dates {
    my ($PY, $Y2) = @_;

    my $month;
    my $day;
    my $otherinfo;

    # treat missing arguments like empty fields
    $PY = "" unless defined($PY);
    $Y2 = "" unless defined($Y2);

    # strip leading slashes
    $Y2 =~ s!^/*!!;

    # see whether we have a monthname or a monthabbrev followed by a slash
    $month = _strip_leading_month(\$Y2, \%monthnames, "/");
    if (!defined($month)) {
        $month = _strip_leading_month(\$Y2, \%monthabbrevs, "/");
    }

    # if we still don't have a month, try the first field as a number
    if (!defined($month) && $Y2 =~ m!^(\d{1,2})!) {
        $month = _two_digits($1);
        # drop the month field including its slash.
        # NOTE(review): if Y2 contains no slash at all, nothing is removed
        # here and the same digits are picked up again as the day below;
        # this is how the code has always behaved -- confirm it is intended
        $Y2 =~ s!^[^/]*/!!;
    }

    $otherinfo = $Y2;

    # separate day and otherinfo, if any
    if (length($otherinfo)) {
        $otherinfo =~ s!^[\s/]*!!;
        $day = _strip_leading_day(\$otherinfo);
    }

    if (length($otherinfo)) {
        $otherinfo =~ s!^[\s/]*!!;

        if (!defined($month)) {
            # last attempt to find month and date; assume no slashes.
            # Full names are tried first so that e.g. "March" is not
            # consumed as "Mar" with "ch" left over
            $month = _strip_leading_month(\$otherinfo, \%monthnames, "");
            if (!defined($month)) {
                $month = _strip_leading_month(\$otherinfo, \%monthabbrevs, "");
            }

            # maybe there's also a day; as before, whatever was found
            # earlier is replaced, even if nothing is found this time
            if (length($otherinfo)) {
                $otherinfo =~ s!^\s*!!;
                $day = _strip_leading_day(\$otherinfo);
            }
        }
    }

    if (!length($PY)) {
        $log->log_print("warning", "found no PY");
        $PY = "0000";
    }

    # assemble return string; unknown parts show up as empty fields
    return join("/", $PY, map { defined($_) ? $_ : "" } ($month, $day, $otherinfo));
}

## Removes a month name from the start of the string referenced by $textref.
## $table is a reference to a hash mapping names to two-digit month numbers;
## the name must be followed directly by the literal string $terminator
## (which may be empty). Returns the month number, or undef if no name
## matched; in that case the string is left untouched.
sub _strip_leading_month {
    my ($textref, $table, $terminator) = @_;

    # iterate over a fresh key list on every call. A while/each loop that is
    # left early keeps its position in the hash, so the next call would
    # resume in the middle and miss some of the names
    foreach my $name (keys %$table) {
        if ($$textref =~ s!^\Q$name$terminator\E!!) {
            return $table->{$name};
        }
    }

    return;
}

## Removes one or two leading digits, plus any whitespace following them,
## from the string referenced by $textref. Returns the digits as a two-digit
## day, or undef (string untouched) if the string does not start with a digit.
sub _strip_leading_day {
    my ($textref) = @_;

    if ($$textref =~ s/^(\d{1,2})\s*//) {
        return _two_digits($1);
    }

    return;
}

## Left-pads a single digit with a zero; longer strings are returned as is.
sub _two_digits {
    my ($number) = @_;

    return (length($number) == 1) ? "0" . $number : $number;
}
