###########################################################################
#
# OpenAIGPTsPlugout.pm -- the plugout module to output docs in a form
#                         suitable for ingest into OpenAI's GPTs capability
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2006 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# The role of this plugout is essentially to turn a raw doc_obj
# representation into a decent-looking HTML file based on the
# metadata and text, along with all the associated files

# Approach taken is to encode an XML Transform which changes
# Greenstone's doc.xml format into HTML

package OpenAIGPTsPlugout;

use strict;
no strict 'refs';
no strict 'subs';

eval {require bytes};
use util;
use FileUtils;
use GreenstoneXMLPlugout;
use docprint;

# Set up inheritance at compile time: this plugout is a specialisation
# of GreenstoneXMLPlugout.
BEGIN {
    @OpenAIGPTsPlugout::ISA = qw(GreenstoneXMLPlugout);
}

# Arguments this plugout contributes in addition to whatever it inherits.
# NOTE(review): the 'desc' values look like lookup keys rather than display
# text, so their exact spelling (including "BasPlugout") is presumably
# significant -- confirm against the string resources before altering them.
my $arguments = [
    {
        'name'      => 'xslt_file',
        'desc'      => '{BasPlugout.xslt_file}',
        'type'      => 'string',
        'reqd'      => 'no',
        'deft'      => 'gsdom2gpts.xsl',
        'hiddengli' => 'no',
    },
];

# Description of the plugout itself; 'args' points at the list above.
my $options = {
    'name'     => 'OpenAIGPTsPlugout',
    'desc'     => '{OpenAIGPTsPlugout.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# Constructor.
#
# Arguments (after the class name):
#   $plugoutlist     - array ref; this class's name is appended to it
#   $inputargs       - handed unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugout's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by the GreenstoneXMLPlugout constructor,
# re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;
    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) call: the indirect-object form
    # ("new Class(...)") is parsed ambiguously by Perl and is best avoided.
    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    # This plugout has no set-up of its own yet, so the 'info_only' case
    # (where options need not be processed) and the normal case finish in
    # exactly the same way.
    return bless $self, $class;
}


# Name of the file a document is written to: the document's OID followed
# by ".html".
#
#   $doc_obj - document object; only its get_OID() method is used
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    return $doc_obj->get_OID() . '.html';
}

# Note: This type of grouping (into dirs) is different to the base
# class's group_size. 'group_size' is about concatenating multiple
# Greenstone documents into one file (useful for small docs/records
# such as MARC).  The 'grouped_into_dirs' variables provide the
# ability to save multiple, separate documents into the same
# archives/export folder.

# Prefix of every group directory name.  While this is defined, documents
# are assigned to numbered group directories.
my $grouped_into_dirs_root = 'group-';

# Group size consulted when deciding to move on to the next group directory.
my $max_docs_per_grouped_dir = 10;

# Running counters shared by every plugout instance in this process:
# how many documents have been given a directory so far, and the number
# of the group directory currently being filled.
my $grouped_into_dirs_doc_count   = 0;
my $grouped_into_dirs_group_count = 0;
    
    
# Choose, and create if necessary, the directory a document is saved in.
#
# Arguments:
#   $working_info - not used by this implementation
#   $working_dir  - directory under which the document directory is made
#   $OID          - document identifier; only consulted when grouping is
#                   switched off ($grouped_into_dirs_root undefined)
#
# Returns the directory name relative to $working_dir, always ending in
# '.dir'.  Dies if the directory still does not exist after the attempt
# to create it.
sub get_new_doc_dir
{
  my $self = shift (@_);
  my ($working_info, $working_dir, $OID) = @_;

  my $doc_dir;

  if (defined $grouped_into_dirs_root) {
      # Move on to the next group directory only once the current one has
      # been handed out $max_docs_per_grouped_dir times.  The test is made
      # before the document count is incremented so that the first group
      # fills up completely rather than stopping one document short.
      if (($grouped_into_dirs_doc_count > 0)
          && (($grouped_into_dirs_doc_count % $max_docs_per_grouped_dir) == 0)) {
          $grouped_into_dirs_group_count++;
      }
      $grouped_into_dirs_doc_count++;

      # The prefix is kept out of the format string so that a '%' in it
      # could never be mistaken for a conversion.
      $doc_dir = $grouped_into_dirs_root . sprintf("%04d", $grouped_into_dirs_group_count);
  }
  else {

      # A slimmed down version of BasePlugout::get_new_doc_dir()
      # which creates a flat file structure, rather than a nested one

      my $doc_dir_rest = $OID;

      # remove any \ and / from the OID
      $doc_dir_rest =~ s/[\\\/]//g;

      # Remove ":" if we are on Windows OS, as otherwise they get confused with the drive letters
      if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i)
      {
          $doc_dir_rest =~ s/\://g;
      }

      $doc_dir = $doc_dir_rest;
  }

  my $doc_dir_name = $doc_dir . '.dir';
  my $full_doc_dir = FileUtils::filenameConcatenate($working_dir, $doc_dir_name);

  if (!FileUtils::directoryExists($full_doc_dir))
  {
      FileUtils::makeAllDirectories($full_doc_dir);

      # Confirm the directory really is there now, instead of assuming
      # that the creation call succeeded.
      if (!FileUtils::directoryExists($full_doc_dir))
      {
          die("Error! Failed to create directory for document: $doc_dir\n");
      }
  }

  return $doc_dir_name;
}



# Work out which directory the given document is to be written into.
#
#   $doc_obj - the document being output
#
# Returns the directory name.  Returns nothing (bare return) if the
# previous group's output could not be closed.
#
# When a new directory is started this records it in
# $self->{'gs_doc_dir'}, sets $self->{'new_doc_dir'} to 1 and resets
# $self->{'group_position'} to 1; otherwise the remembered directory is
# reused and $self->{'new_doc_dir'} is set to 0.
sub get_group_doc_dir {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # If this plugout is being used with grouped_into_dirs, then how
    # get_group_doc_dir() needs to operate is different.  In fact it is
    # simpler than the super-class implementation, because (due to the
    # prefix manipulation of gsdlassocfiles) it is safe for associated
    # files to be saved in the same directory as other documents.

    my $doc_dir = undef;

    if (defined $grouped_into_dirs_root) {

        my $groupsize = $self->{'group_size'};
        my $gs_count  = $self->{'gs_count'};

        # A new output file is started every 'group_size' documents.
        # NOTE(review): assumes 'group_size' is never 0 -- confirm where
        # it is set.
        my $open_new_file = (($gs_count % $groupsize) == 0);

        # opening a new file
        if (($open_new_file) || !defined($self->{'gs_doc_dir'})) {
            # first we close off the old output
            if ($gs_count > 0)
            {
                return if (!$self->close_group_output());
            }

            # this will create the directory
            $doc_dir = $self->get_doc_dir ($doc_obj);
            $self->{'new_doc_dir'} = 1;
            $self->{'gs_doc_dir'} = $doc_dir;
            $self->{'group_position'} = 1;
        }
        else {
            $doc_dir = $self->{'gs_doc_dir'};
            $self->{'new_doc_dir'} = 0;
        }

    }
    else {
        # Hand the document on, so that the super-class receives the same
        # argument this method was given.
        $doc_dir = $self->SUPER::get_group_doc_dir($doc_obj);
    }

    return $doc_dir;
}




# Rewrite the text of one section, and then of all its sub-sections, into
# plain text wrapped in markup that identifies the source document.
#
#   $doc_obj - document object; its _lookup_section() and get_OID()
#              methods are used
#   $section - identifier of the section to start from
#
# The section's 'text' entry is replaced in place.  Nothing is done if the
# section cannot be found.
sub recursive_process_section_content
{
    my $self = shift (@_);
    my ($doc_obj, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section ($section);
    return unless defined $section_ptr;

    my $oid = $doc_obj->get_OID();

    my $text = $section_ptr->{'text'};

    # Associated files are saved with an "<OID>-" prefix, so references to
    # them are rewritten to match.
    $text =~ s/_httpdocimg_\//$oid-/g;

    # Turn into text.  Every <style> element is dropped together with its
    # content (hence /g); otherwise the CSS of second and later elements
    # would survive the tag stripping below as stray text.
    $text =~ s/<style[^>]*>.*?<\/style>//gsi;
    $text =~ s/<[^>]*>/ /g;

    # Squeeze each run of whitespace that ends in a space down to one space.
    # NOTE(review): runs made up only of newlines or tabs are left as they
    # are -- confirm whether a plain \s+ was intended.
    $text =~ s/\s+ / /mg;

    # Drop an empty marker element naming the document after every run of
    # 10 whitespace-separated tokens.
    my $gsdoc_marker = "<span fromGSDocId=\"$oid\"></span>";
    $text =~ s/((?:[^\s]+\s*){10})/$1$gsdoc_marker/sg;

    $section_ptr->{'text'} = "<div gsdocid=\"$oid\">$gsdoc_marker$text$gsdoc_marker</div>";

    # print STDERR "*** text = $text\n";

    # work through all the sub-sections
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
        $self->recursive_process_section_content($doc_obj, "$section.$subsection");
    }
}

# Rewrite the text of every section of a document, starting from the
# section reported by the document's get_top_section().
#
#   $doc_obj - the document whose section text is to be processed
sub process_content
{
    my ($self, $doc_obj) = @_;

    my $start_section = $doc_obj->get_top_section();

    $self->recursive_process_section_content($doc_obj, $start_section);
}


# Save a document's associated files after prefixing every associated
# file name with "<OID>-", and rewrite the document text to match.
#
#   $doc_obj - the document being output
#   $doc_dir - passed on unchanged to the super-class
#   $handle  - passed on unchanged to the super-class
sub process_assoc_files {
    my ($self, $doc_obj, $doc_dir, $handle) = @_;

    my $assoc_files = $doc_obj->get_assoc_files();
    my $oid = $doc_obj->get_OID();

    # Build a fresh list of records; only the file name field changes.
    my @renamed_records;
    foreach my $record (@$assoc_files) {
        my ($source_path, $relative_name, $mime_type, $section) = @$record;

        push(@renamed_records,
             [$source_path, $oid . '-' . $relative_name, $mime_type, $section]);
    }

    # Not the cleanest way to do this (it reaches inside the document
    # object), but it gets the job done
    $doc_obj->{'associated_files'} = \@renamed_records;

    # The content also needs processing, as links to associated images
    # have now changed
    $self->process_content($doc_obj);

    $self->SUPER::process_assoc_files($doc_obj, $doc_dir, $handle);
}

# Write the opening of an output file: the XML declaration followed by
# the opening <Archive> tag.
#
#   $outhandle - filehandle to print to
#   $doc_oid   - not used by this implementation
sub output_xml_header {
    my ($self, $outhandle, $doc_oid) = @_;

    # Alternatives kept for reference (none of them is emitted):
    #   <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
    #   <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/strict.dtd">
    #   <html xmlns="http://www.w3.org/TR/xhtml1/strict">

    print {$outhandle} qq{<?xml version="1.0" encoding="utf-8" standalone="no"?>\n};
    print {$outhandle} "<Archive>\n";
}

# Write the end of an output file: the closing </Archive> tag.
#
#   $outhandle - filehandle to print to
sub output_xml_footer {
    my ($self, $outhandle) = @_;

    print {$outhandle} "</Archive>\n";
}



1;

