###########################################################################
#
# CSVPlugin.pm -- A plugin for files in comma-separated value format
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package CSVPlugin;

use extrametautil;

use ReadTextFile;
use SplitTextFile; # for a couple routines, but we not inheriting
use MetadataRead;
use CSVFieldSeparator;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use Text::CSV;

# Compile-time setup: declare the parent classes of CSVPlugin and make
# STDERR emit UTF-8.
BEGIN {
    @CSVPlugin::ISA = qw(MetadataRead ReadTextFile CSVFieldSeparator);
    binmode STDERR, ":utf8";
}


# Plugin-specific arguments.  Every entry is a hash giving the argument's
# name, description key, type and whether it is required; 'deft' (where
# present) is the default value.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'filename_field',
        'desc' => '{CSVPlugin.filename_field}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => 'Filename',
    },
    {
        'name' => 'no_document_if_source_unspecified',
        'desc' => '{CSVPlugin.no_document_if_source_unspecified}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'no_document_if_source_missing',
        'desc' => '{CSVPlugin.no_document_if_source_missing}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'use_namespace_for_field_names',
        'desc' => '{CSVPlugin.use_namespace_for_field_names}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'store_field_values_as_document_text',
        'desc' => '{CSVPlugin.store_field_values_as_document_text}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name' => 'ignore_field',
        'desc' => '{CSVPlugin.ignore_field}',
        'type' => 'string',
        'reqd' => 'no',
    },
];


# Plugin-level options; 'args' points at the argument table above.
my $options = {
    'name'     => 'CSVPlugin',
    'desc'     => '{CSVPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};


# Default value for the process_exp argument: a case-insensitive regular
# expression (as a string) matching file names that end in ".csv".
sub get_default_process_exp {
    my $csv_suffix_exp = '(?i)(\.csv)$';
    return $csv_suffix_exp;
}

# Constructor.
#
# Adds this class to $pluginlist and this plugin's argument/option tables to
# $hashArgOptLists, builds the object through ReadTextFile's constructor,
# initialises the per-file stores used by metadata_read()/read(), and turns
# blank 'use_namespace_for_field_names' / 'ignore_field' settings into undef.
# Returns the object blessed into $class.
sub new
{
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # The object returned by CSVFieldSeparator's constructor is discarded;
    # presumably the call is made for what it adds to the shared lists
    # (TODO confirm against CSVFieldSeparator).
    CSVFieldSeparator->new($pluginlist, $inputargs, $hashArgOptLists);
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'}   = {};
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc

    # A value made up only of whitespace means "not set".
    foreach my $blankable_arg ('use_namespace_for_field_names', 'ignore_field') {
        my $arg_value = $self->{$blankable_arg};
        next unless defined $arg_value;
        if ($arg_value =~ m/^\s*$/) {
            $self->{$blankable_arg} = undef;
        }
    }

    return bless $self, $class;
}


# Record the CSV file in $block_hash->{'metadata_files'} so that it is
# marked as a metadata file.  Files this plugin cannot process are left
# alone.  Always returns undef.
sub file_block_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata, $gli) = @_;

    my $csv_path = &FileUtils::filenameConcatenate($base_dir, $file);
    if (!$self->can_process_this_file($csv_path)) {
        return undef;
    }

    my $on_native_windows = ($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin");
    if ($on_native_windows) {
        # convert to full name - paths stored in block hash are long filenames
        $csv_path = &util::upgrade_if_dos_filename($csv_path);
    }

    # Note (kjdon): earlier code also registered lower- and upper-case drive
    # letter variants of the path here.  The upgrade call above is said to
    # convert everything to a lower-case drive letter, so only the one path
    # is stored.
    $block_hash->{'metadata_files'}->{$csv_path} = 1;

    return undef; #1
}

# Metadata pass over a CSV file.
#
# The first row supplies the metadata field names (spaces and surrounding
# double quotes are stripped, and the 'use_namespace_for_field_names' prefix
# is added when set).  Each following row is one record ("segment"):
#   * rows with a non-blank value in the 'ignore_field' column are skipped;
#   * rows whose filename field names a file that exists in the same
#     directory as the CSV file have their metadata passed to
#     store_meta_in_extrametadata();
#   * all other rows are kept in $self->{'metadata_store'}->{$file}, keyed by
#     segment number, for read() to turn into dummy documents, unless the
#     no_document_if_source_missing / no_document_if_source_unspecified
#     options say otherwise.
# The detected language and encoding are remembered in
# $self->{'textcat_store'}->{$file} for read().
#
# Returns undef if this plugin does not handle the file, -1 if the file
# cannot be opened or its header row cannot be parsed, and otherwise the
# result of closing the file (true on success).
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);
    # the current directory; source documents named in the CSV are looked up here
    my $current_dir = &util::filename_head($filename_full_path);

    print STDERR "\n<Processing n='$file' p='CSVPlugin'>\n" if ($gli);
    print STDERR "CSVPlugin: processing $file\n" if ($self->{'verbosity'} > 1);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity = $self->{'verbosity'};

    # don't add to block list, as we may do some processing in read.

    # Do encoding stuff
    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    if ($self->{'verbosity'} > 2) {
        print $outhandle "CSVPlugin: reading $file as ($content_encoding,$language)\n";
    }
    # store these values for read
    my $le_rec = { 'language' => $language, 'encoding' => $content_encoding };
    $self->{'textcat_store'}->{$file} = $le_rec;

    my $metadata_store = {};
    $self->{'metadata_store'}->{$file} = $metadata_store; # used to record metadata for segments with no src doc

    # Report an open failure directly rather than letting it show up later
    # as a bogus "badly formatted header" error.
    my $CSV_FILE;
    if (!open($CSV_FILE, "<:encoding($content_encoding)", $filename_full_path)) {
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Unable to open CSV file for reading: $!");
        return -1;
    }

    my $separate_char = $self->{'csv_field_separator'};

    # An empty separator setting means "do not split values"
    my $md_val_sep = $self->{'metadata_value_separator'};
    undef $md_val_sep if (defined $md_val_sep && $md_val_sep eq "");

    my $csv_file_field_line;
    if ($separate_char =~ m/^auto$/i) {
        # Work out the separator from the raw header line
        $csv_file_field_line = <$CSV_FILE>;
        $separate_char = $self->resolve_auto($csv_file_field_line,$self->{'plugin_type'});
        seek $CSV_FILE, 0, 0; # move pointer back to start of file, as we want to read in the fields using csv.
    }

    my $md_sep_fields = $self->{'metadata_separate_fields'};
    undef $md_sep_fields if (defined $md_sep_fields && $md_sep_fields eq "");

    # When set, only the fields listed here have their values split on $md_val_sep
    my $md_sep_fields_lookup = undef;
    if (defined $md_sep_fields) {
        $md_sep_fields_lookup = {};
        for my $md_field (split(/\s*,\s*/, $md_sep_fields)) {
            $md_sep_fields_lookup->{$md_field} = 1;
        }
    }

    my $csv = Text::CSV->new();
    $csv->sep_char($separate_char);
    $csv->binary(1);

    my $first_row = $csv->getline ($CSV_FILE);
    if (!defined $first_row) {
        # $csv_file_field_line is only read when the separator is 'auto'
        my $header_text = defined $csv_file_field_line ? $csv_file_field_line : "";
        $self->print_error($outhandle, $failhandle, $gli, $filename_full_path, "Error: Badly formatted CSV header line: $header_text");
        close ($CSV_FILE); # don't leak the handle on the error path
        return -1;
    }
    my @csv_file_fields = @$first_row;

    my $found_filename_field = 0;
    my $filename_field = $self->{'filename_field'};
    my $ignore_field = $self->{'ignore_field'};
    my $ignore_col;
    for my $i (0 .. $#csv_file_fields) {
        # Remove any spaces from the field names, and surrounding quotes too
        $csv_file_fields[$i] =~ s/ //g;
        $csv_file_fields[$i] =~ s/^"//;
        $csv_file_fields[$i] =~ s/"$//;

        if ($self->{'use_namespace_for_field_names'}) {
            $csv_file_fields[$i] = $self->{'use_namespace_for_field_names'} . "." . $csv_file_fields[$i];
        }
        if ($csv_file_fields[$i] eq $filename_field) {
            $found_filename_field = 1;
        }
        if ($ignore_field && $csv_file_fields[$i] eq $ignore_field) {
            $ignore_col = $i;
        }
    }

    if (!$found_filename_field) {
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field field in CSV file, metadata cannot be assigned to documents, will use metadata only dummy documents");
    }

    # protect square brackets in metadata values by encoding them as numeric
    # character references, as unescaped square bracket chars in metadata
    # have special meaning in GS' Java runtime code
    my $escape_square_brackets = sub {
        my ($meta_value) = @_;
        $meta_value =~ s/\[/&\#091;/g;
        $meta_value =~ s/\]/&\#093;/g;
        return $meta_value;
    };

    my $count = 0;
    while (my $csv_line = $csv->getline($CSV_FILE)) {
        my @md_vals = @$csv_line;

        if (defined $ignore_col && $md_vals[$ignore_col] =~ m/\w/) {
            # ignore this line
            print STDERR "ignoring line ".join(",", @md_vals)."\n";
            next;
        }
        $count++;

        # Build a hash of metadata name to metadata value for this line
        my %csv_line_metadata;

        for my $i (0 .. $#md_vals) {
            my $md_val = $md_vals[$i];
            # Only bother with non-empty and non-only-whitespace values
            # that sit under a header column
            next unless ($md_val =~ m/\S/ && defined($csv_file_fields[$i]));

            my $md_name = $csv_file_fields[$i];

            # Split the value only if a separator is configured and either
            # no field restriction is in play or this field is listed in it
            my $needs_md_val_sep = 0;
            if (defined $md_val_sep) {
                if (defined $md_sep_fields_lookup) {
                    $needs_md_val_sep = 1 if ($md_sep_fields_lookup->{$md_name});
                }
                else {
                    $needs_md_val_sep = 1;
                }
            }

            my @raw_md_vals = $needs_md_val_sep ? split(/${md_val_sep}/, $md_val) : ($md_val);
            $csv_line_metadata{$md_name} = [ map { $escape_square_brackets->($_) } @raw_md_vals ];
        }

        # don't process this line unless we had some actual values
        next if (scalar(keys %csv_line_metadata) == 0);

        # A "Section" value containing digits/dots causes every metadata name
        # on this line to get a "///Section/<value>" suffix.
        # NOTE(review): the suffix is also applied to the filename field, so
        # the lookup of $filename_field below will not find it for such
        # lines - confirm this is intended.
        my $csv_line_section_array = $csv_line_metadata{"Section"};
        if (defined $csv_line_section_array) {
            my $section_value = shift(@$csv_line_section_array);
            if ($section_value =~ /[\d.]+/m) {
                my $section_suffix = "///Section/" . $section_value;
                foreach my $metaname (keys %csv_line_metadata) {
                    my $new_name = $metaname . $section_suffix;
                    $csv_line_metadata{$new_name} = delete $csv_line_metadata{$metaname};
                }
            }
            else {
                # not a section number after all; put the value back
                unshift(@$csv_line_section_array, $section_value);
            }
        }

        # do we have filename field?
        # We can't associate any metadata without knowing the file to associate it with
        my $has_srcdoc = 0;
        my $missing_srcdoc = 0;
        my $csv_line_filename = "";
        if ($found_filename_field) {
            # is there a srcdoc mentioned?
            my $csv_line_filename_array = $csv_line_metadata{$filename_field};
            if (!defined $csv_line_filename_array) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "No $filename_field metadata in CSV line num $count");
            }
            else {
                $csv_line_filename = shift(@$csv_line_filename_array);
                # TODO - have an option for whether we do this or not
                if (&FileUtils::fileExists(&FileUtils::filenameConcatenate($current_dir, $csv_line_filename))) {
                    $has_srcdoc = 1;
                    delete $csv_line_metadata{$filename_field};
                }
                else {
                    $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "$csv_line_filename in $filename_field metadata in CSV line num $count is not found");
                    $missing_srcdoc = 1; # there was one mentioned but its not found
                }
            }
        }

        if ($has_srcdoc) {
            print $outhandle "Storing metadata, segment $count, for document $csv_line_filename\n" if ($verbosity > 2);
            $self->store_meta_in_extrametadata($csv_line_filename, \%csv_line_metadata, $file, $filename_full_path, $extrametakeys, $extrametadata, $extrametafile);
        }
        else {
            my $store_for_dummy = 1;
            if ($missing_srcdoc && $self->{'no_document_if_source_missing'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is missing");
                $store_for_dummy = 0;
            }
            elsif (!$missing_srcdoc && $self->{'no_document_if_source_unspecified'}) {
                $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path,"Not storing metadata for line $count as source doc is unspecified");
                $store_for_dummy = 0;
            }
            if ($store_for_dummy) {
                print $outhandle "Storing metadata for dummy document, segment $count\n" if ($verbosity > 2);
                $metadata_store->{$count} = \%csv_line_metadata;
            }
        }
    } # while csv_line = csv->getline

    # getline() returns undef at end of file and also on a parse error;
    # report the latter so that silently dropped rows do not go unnoticed.
    if (!$csv->eof()) {
        my $csv_diag = "" . $csv->error_diag();
        $self->print_warning($outhandle, $failhandle, $gli, $filename_full_path, "Stopped reading CSV file after $count data lines because of a parse error: $csv_diag");
    }

    return close ($CSV_FILE);
}

# Document pass, adapted from the read routine in SplitTextFile.
#
# Creates one document per segment that metadata_read() left in
# $self->{'metadata_store'}->{$file}, attaches the language/encoding found
# in that pass together with the segment's metadata, and hands each document
# to $processor.  Stops early once $maxdocs documents have been processed
# (unless $maxdocs is -1).
#
# Returns undef if this plugin does not handle the file, 0 if no
# language/encoding record was stored for it, and otherwise the number of
# document objects produced.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my $lang_enc_rec = $self->{'textcat_store'}->{$file};
    # nothing recorded in the metadata pass: not processed, but no point in
    # passing it on
    return 0 unless defined $lang_enc_rec;

    if ($gli) {
        print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "$self->{'plugin_type'} processing $file\n";
    }

    my ($language, $encoding) = ($lang_enc_rec->{'language'}, $lang_enc_rec->{'encoding'});
    $self->{'textcat_store'}->{$file} = undef;

    # a hash of seg num to metadata hash
    my $metadata_store = $self->{'metadata_store'}->{$file};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);

    my $docs_produced = 0; # num doc objs produced
    my $base_oid;          # worked out from the first document, shared by all segments

    # Process each segment in turn, in numeric order
    for my $seg_num (sort { $a <=> $b } keys (%$metadata_store)) {
        if ($self->{'verbosity'} > 1) {
            print $outhandle "processing segment $seg_num as its own document\n";
        }
        $docs_produced++;

        # create a new document
        my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$seg_num");
        $self->associate_cover_image($doc_obj, $filename_full_path) if ($self->{'cover_image'});
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $top_section, $metadata);

        # add our stored metadata from metadata_read pass
        my $seg_metadata = $metadata_store->{$seg_num};
        $self->extra_metadata($doc_obj, $top_section, $seg_metadata);

        if ($self->{'store_field_values_as_document_text'}) {
            # every field contributes its values joined by ", ", each
            # followed by a trailing ", "
            my $field_text = join("",
                                  map { join(", ", @{$seg_metadata->{$_}}) . ", " }
                                  keys %$seg_metadata);
            $doc_obj->add_utf8_text($top_section, $field_text);
        }
        # do any automatic metadata extraction - does this make sense??
        #$self->auto_extract_metadata ($doc_obj);

        # Calculate a "base" document ID.
        $base_oid = &SplitTextFile::get_base_OID($self,$doc_obj) unless defined $base_oid;

        # add an OID
        &SplitTextFile::add_segment_OID($self, $doc_obj, $base_oid, $seg_num);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
        last if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs);
    }

    delete $self->{'metadata_store'}->{$file};

    # Return number of document objects produced
    return $docs_produced;
}

# Write a "CSVPlugin Warning" line for $file to both the output and the
# fail handle, and, when $gli is set, a <ProcessingError> element to STDERR.
sub print_warning {
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $warning_line = "CSVPlugin Warning: $file: $error\n";
    for my $handle ($outhandle, $failhandle) {
        print {$handle} $warning_line;
    }

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}
# Write a "CSVPlugin Error" line for $file to both the output and the fail
# handle, and, when $gli is set, a <ProcessingError> element to STDERR.
sub print_error
{
    my ($self, $outhandle, $failhandle, $gli, $file, $error) = @_;

    my $error_line = "CSVPlugin Error: $file: $error\n";
    for my $handle ($outhandle, $failhandle) {
        print {$handle} $error_line;
    }

    if ($gli) {
        print STDERR "<ProcessingError n='$file' r='$error'/>\n";
    }
}


1;
