#!/usr/bin/perl -w

###########################################################################
#
# getoai.pl --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################


# This program will contact the named DL server
#  and export its metadata and (optionally) it documents.

# Currently only designed for OAI exporting


BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}

use colcfg;
use util;
use parsargv;
use FileHandle;


sub print_usage {
    print STDERR "\n  usage: $0 [options] collection-name\n\n";
    print STDERR "  options:\n";
    print STDERR "   -verbosity number      0=none, 3=lots\n";
    print STDERR "   -getid                 Also download if source document if present\n";
    print STDERR "   -importdir directory   Where the original material lives\n";
    print STDERR "   -keepold               Will not destroy the current contents of the\n";
    print STDERR "                          import directory (the default)\n";
    print STDERR "   -removeold             Will remove the old contents of the import\n";
    print STDERR "                          directory -- use with care\n";
    print STDERR "   -gzip                  Use gzip to compress exported documents\n";
    print STDERR "                          (don't forget to include ZIPPlug in your plugin\n";
    print STDERR "   -maxdocs number        Maximum number of documents to import\n";
    print STDERR "   -debug                 Print imported text to STDOUT\n";
    print STDERR "   -collectdir directory  Collection directory (defaults to " .
	&util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
    print STDERR "   -out                   Filename or handle to print output status to.\n";
    print STDERR "                          The default is STDERR\n\n";
}




sub print_usage_old
{
    my ($prog_name) = @_;

    print STDERR "Usage: $prog_name OAI-base-URL\n";
    exit 1;
}

sub get_oai_ids
{
    my ($base_url, $out) = @_;

    print $out "Requesting list of identifiers ...\n";

    open (OAIIN,"wget -q -O - $base_url?verb=ListIdentifiers|")
	|| die "wget request failed: $!\n";
 
    my $li_record = "";

    my $line;
    while (defined($line=<OAIIN>))
    {
	$li_record .= $line;
	# print $out $line;
    }
    
    close(OAIIN);
    print $out "... Done.\n";

    return $li_record;
}

sub parse_oai_ids
{
    my ($li_record, $out) = @_;

    # extract identifier list
    $li_record =~ s/^.*?<identifier>/<identifier>/s;
    $li_record =~ s/^(.*<\/identifier>).*$/$1/s;

    my @ids = ();

    while ($li_record =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
    {
	$li_record = $2;
	push(@ids,$1);
    }

    return \@ids;
}


sub dir_file_split
{
    my ($file) = @_;

    my @dirs = split("/",$file);
    my $local_file = pop(@dirs);
    my $sub_dirs = join("/",@dirs);

    return ($sub_dirs,$local_file);
}

sub get_oai_document
{
    my ($doc_url,$output_dir, $out) = @_;

    my ($id_dir,$id_fname) = dir_file_split($doc_url);

    print $out "Getting document $doc_url\n";

    `mkdir $output_dir/srcdocs` if (!-e "$output_dir/srcdocs");
	
    my $wget_cmd = "wget -q -O $output_dir/srcdocs/$id_fname \"$doc_url\"";

    (system($wget_cmd)==0)
	|| print STDERR "Error: failed to execute $wget_cmd\n";
    
}

sub get_oai_records
{
    my ($base_url,$ids,$output_dir, $get_id, $maxdocs, $out) = @_;

    my $doc_count = 0;

    my $i;
    foreach $i ( @$ids )
    {
	# wget it;
	my $url = "$base_url?verb=GetRecord&metadataPrefix=oai_dc";
	$url .= "&identifier=$i";
	print $out "Downloading metadata record for $i\n";

	my $file_i = "$output_dir/$i.oai";
	$file_i =~ s/:/\//g;

	# obtain record
	my $wget_cmd = "wget -q -O - \"$url\"";

	open (OAIIN,"$wget_cmd|")
	    || die "wget request failed: $!\n";
	my $i_record = "";
	
	my $line;
	while (defined($line=<OAIIN>))
	{
	    $i_record .= $line;
	}

	close(OAIIN);
	
	# prepare subdirectory for record (if needed)
	my ($i_dir,$unused) = dir_file_split($file_i);
	`mkdir -p $i_dir`;

	# look out for identifier tag in metadata section
	if ($i_record =~ m/<metadata>(.*)<\/metadata>/s)
	{
	    my $m_record = $1;

	    if ($get_id)
	    {
		if ($m_record =~ m/<identifier>(.*?)<\/identifier>/s)
		{
		    my $doc_url = $1;
		    get_oai_document($doc_url,$i_dir, $out);


		    my ($id_dir,$id_fname) = dir_file_split($doc_url);

		    $i_record =~ s/<metadata>(.*?)<identifier>$doc_url<\/identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$doc_url<\/OrigURL><identifier>srcdocs\/$id_fname<\/identifier>$2<\/metadata>/s;
		}
	    }
	}

	# save record 
	open (OAIOUT,">$file_i")
	    || die "Unable to save oai metadata record: $!\n";
	print OAIOUT $i_record;
	close(OAIOUT);

	$doc_count++;
	last if ($doc_count == $maxdocs);
    }
}


sub main {
    my ($verbosity, $importdir, $keepold, 
	$removeold, $gzip, $groupsize, $debug, $maxdocs, $collection,
	$configfilename, $collectcfg,
	$out, $collectdir);

    if (!parsargv::parse(\@ARGV, 
			 'verbosity/\d+/2', \$verbosity,
			 'getid', \$getid,
			 'importdir/.*/', \$importdir,
			 'keepold', \$keepold,
			 'removeold', \$removeold,
			 'gzip', \$gzip,
			 'debug', \$debug,
			 'maxdocs/^\-?\d+/-1', \$maxdocs,
			 'collectdir/.*/', \$collectdir,
			 'out/.*/STDERR', \$out)) {
	&print_usage();
	die "\n";
    }

    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
	open (OUT, ">$out") || die "Couldn't open output file $out\n";
	$out = 'import::OUT';
	$close_out = 1;
    }
    $out->autoflush(1);

    # set removeold to false if it has been defined
    $removeold = 0 if ($keepold);

    # get and check the collection name
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
	&print_usage();
	die "\n";
    }


    # get the list of export servers for this collection
    my $export = [];
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    if (-e $configfilename) {
	$collectcfg = &colcfg::read_collect_cfg ($configfilename);
	if (defined $collectcfg->{'export'}) {
	    $export = $collectcfg->{'export'};
	}
	if (defined $collectcfg->{'importdir'} && $importdir eq "") {
	    $importdir = $collectcfg->{'importdir'};
	}
	if (defined $collectcfg->{'removeold'}) {
	    if ($collectcfg->{'removeold'} =~ /^true$/i && !$keepold) {
		$removeold = 1;
	    }
	    if ($collectcfg->{'removeold'} =~ /^false$/i && !$removeold) {
		$removeold = 0;
	    }
	}
    } else {
	die "Couldn't find the configuration file $configfilename\n";
    }
    
    # fill in the default import directory if none
    # were supplied, turn all \ into / and remove trailing /
    $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    $importdir =~ s/[\\\/]+/\//g;
    $importdir =~ s/\/$//;

    # remove the old contents of the import directory if needed
    if ($removeold && -e $importdir) {
	print $out "Warning - removing current contents of the import directory\n";
	print $out "          in preparation for the export\n";
	sleep(5); # just in case...
	&util::rm_r ($importdir);
    }

    my $e;
    foreach $e ( @$export )
    {
	print $out "Exporting from $e->[0]\n";

	my $base_url = $e->[0];

	my $li_record = get_oai_ids($base_url,$out);
	my $ids = parse_oai_ids($li_record,$out);

	get_oai_records($base_url,$ids,$importdir, $getid, $maxdocs, $out);
    }
    
    close OUT if $close_out;
}


&main();








