/**
 *############################################################################
 * A component of the Greenstone Librarian Interface, part of the Greenstone
 * digital library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: Michael Dewsnip, NZDL Project, University of Waikato, NZ
 *
 * Copyright (C) 2004 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *############################################################################
 */

package org.greenstone.gatherer.metadata;


import java.io.*;
import java.util.*;
import java.net.URLDecoder;
import org.greenstone.gatherer.DebugStream;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.util.Utility;

//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
import org.apache.commons.codec.binary.Base64; // decoding from Base64 works

/**
 * This class represents one doc.xml file (or, through subclasses, a file in
 * another archive format such as docmets.xml).
 *
 * The file is never parsed as XML. It is read line by line and searched for
 * the "wrapper" element (MetadataWrap) that encloses metadata and the "item"
 * element (MetadataItem) that holds a single name/value pair. Subclasses supply
 * the names of those two elements through the constructor.
 *
 * Usage: call {@link #skimFile()} once to register which source file the
 * archive file describes, then {@link #getMetadataExtractedFromFile(File, String)}
 * to read the extracted metadata for a source file.
 */
public abstract class DocXMLFile extends File
{
    static boolean isWin = Utility.isWindows();

    /** Name of the folder that source file paths are made relative to. */
    private static final String IMPORT_DIR_NAME = "import";

    /** Text that precedes the value of the name attribute of a metadata item. */
    private static final String NAME_ATTRIBUTE_PREFIX = " name=\"";

    /**
     * Maps a source file path (String, relative to the import folder) to an
     * ArrayList of Integer line numbers, each being the line on which a
     * relevant MetadataWrap element starts in this file.
     */
    protected HashMap source_file_name_to_description_elements_mapping = new HashMap();

    /** Name of the element wrapping a group of metadata items. */
    protected final String MetadataWrap;
    /** Name of the element holding a single metadata name/value pair. */
    protected final String MetadataItem;

    // Values of the gsdlsourcefilerenamemethod metadata that are distinguished when decoding
    protected final String FILE_RENAME_METHOD_NONE = "none";
    protected final String FILE_RENAME_METHOD_URL = "url";
    protected final String FILE_RENAME_METHOD_BASE64 = "base64";

    /**
     * @param doc_xml_file_path path of the archive file this object represents
     * @param metaWrap name of the element that wraps metadata items
     * @param metaItem name of the element that holds one metadata item
     */
    public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
    {
        super(doc_xml_file_path);
        this.MetadataWrap = metaWrap;
        this.MetadataItem = metaItem;
    }

    /**
     * Reads the extracted metadata stored in this file for the given source file.
     *
     * On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
     * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows.
     *
     * @param file the source file (not used by this implementation)
     * @param file_relative_path the key to look up in source_file_name_to_description_elements_mapping
     * @return a list of MetadataValue objects; empty if this file holds no
     *         metadata for the source file, or if a "SourceSegment" metadata
     *         item is encountered (such data is ignored completely)
     */
    public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)
    {
        // Build up a list of metadata extracted from this file
        ArrayList metadata_values = new ArrayList();

        // Check whether this file contains extracted metadata for the specified file
        ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
        if (description_elements_list == null || description_elements_list.isEmpty()) {
            // ...it doesn't, so we're done
            return metadata_values;
        }

        MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);

        // Parse the file
        DebugStream.println("Applicable file: " + this);
        BufferedReader buffered_reader = null;
        try {
            buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));

            int description_element_num = 0;
            int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
            boolean in_relevant_description_element = false;

            String line = null;
            for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
                // Check if this line contains the start of a relevant "Description" element
                // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
                if (line_num == next_description_element_start) {
                    in_relevant_description_element = true;
                    continue;
                }

                // If we're not in a relevant Description element we don't care about anything
                if (!in_relevant_description_element) {
                    continue;
                }

                // Check if this line contains the end of the relevant Description element
                if (line.indexOf("</"+MetadataWrap+">") != -1) {
                    description_element_num++;
                    if (description_element_num == description_elements_list.size()) {
                        break;
                    }

                    next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue();
                    in_relevant_description_element = false;
                    continue;
                }

                // If this line doesn't contain a complete Metadata element, we're not interested
                if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) {
                    continue;
                }

                // Extract the metadata element name; skip lines without a complete name attribute
                int name_index = findNameAttributeValueStart(line);
                if (name_index == -1) {
                    continue;
                }
                String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));

                // Metadata with a namespace other than "ex" isn't extracted metadata, so we're not interested
                String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
                if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
                    continue;
                }

                // Extracted metadata! Do it like this just in case we have ex.
                String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);

                // We completely ignore bibliographic data
                if (metadata_element_name.equals("SourceSegment")) {
                    return new ArrayList(); // the reader is closed by the finally clause
                }

                // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
                if (metadata_element_name.startsWith("gsdl")) {
                    continue;
                }

                // The value lies between the end of the opening tag and the last closing tag on the line
                int opening_tag_end = line.indexOf(">", name_index);
                int value_end = line.lastIndexOf("</"+MetadataItem+">");
                if (opening_tag_end == -1 || value_end < opening_tag_end + 1) {
                    continue; // malformed line
                }
                String metadata_element_value = line.substring(opening_tag_end + 1, value_end);

                MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                if (metadata_element == null) {
                    // Same handling as skimFile(): the element isn't defined, so create it for this session
                    DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
                    extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
                    metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                    if (metadata_element == null) {
                        continue; // still unavailable, nothing we can attach the value to
                    }
                }

                // Value trees are not stored for extracted metadata, so create a new value tree node now
                metadata_element.addMetadataValue(metadata_element_value);
                MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value);

                // Add the new metadata value to the list
                MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
                metadata_values.add(metadata_value);
            }
        }
        catch (FileNotFoundException exception) {
            DebugStream.printStackTrace(exception);
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
        finally {
            closeQuietly(buffered_reader);
        }

        return metadata_values;
    }


    /**
     * Every file must be skimmed when a collection is opened, for two reasons:
     *   - To build a mapping from source file to its corresponding doc.xml file
     *   - To get a complete list of all extracted metadata elements
     *
     * The source file name is taken from the gsdlsourcefilename metadata item
     * whose value contains "import" (the last such item if there are several).
     * It is made relative to the import folder, decoded according to the
     * gsdlsourcefilerenamemethod metadata (URL decoding if that is absent) and
     * registered in source_file_name_to_description_elements_mapping. If no
     * such item is found, nothing is registered.
     *
     * All exceptions are reported through DebugStream and not propagated.
     */
    public void skimFile()
    {
        String fileRenameMethod = null;
        String gsdlsourcefilename_value = null;
        int description_element_start_gsdlsourcefilename_value = -1;

        MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);

        // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements
        DebugStream.println("Skimming " + this + "...");
        BufferedReader buffered_reader = null;
        try {
            buffered_reader = new BufferedReader(new InputStreamReader(new FileInputStream(this), "UTF-8"));
            int description_element_start = -1;

            String line = null;
            for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) {
                // This line contains the start of a "MetadataWrap" element
                // (mets:xmlData in METS parlance, Description in GreenstoneArchive format)
                if (line.indexOf("<"+MetadataWrap+">") != -1) {
                    if (description_element_start != -1) {
                        System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!");
                    }
                    description_element_start = line_num;
                    continue;
                }

                // This line contains the end of a "MetadataWrap" element
                if (line.indexOf("</"+MetadataWrap+">") != -1) {
                    if (description_element_start == -1) {
                        System.err.println("Parse error: "+MetadataWrap+" element unstarted!");
                    }
                    description_element_start = -1;
                    continue;
                }

                // If we're not in a "MetadataWrap" element there shouldn't be any Metadata elements
                if (description_element_start == -1) {
                    continue;
                }

                // This line doesn't contain a Metadata element, so we're not interested
                if (line.indexOf("<"+MetadataItem+" ") == -1) {
                    DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element.");
                    continue;
                }

                // Extract the metadata element name; a line without a complete name
                // attribute is skipped rather than aborting the whole skim
                int name_index = findNameAttributeValueStart(line);
                if (name_index == -1) {
                    DebugStream.println("Warning: "+MetadataItem+" element line doesn't contain a name attribute.");
                    continue;
                }
                String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index));

                // Metadata with a namespace other than "ex" isn't extracted metadata, so we're not interested
                String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
                if (!metadata_set_namespace.equals("") && !metadata_set_namespace.equals("ex")) {
                    continue;
                }

                // Extracted metadata! May have ex. so make sure we remove that
                String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
                if (metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
                    String rename_method_value = extractValueUpToNextTag(line, name_index);
                    if (rename_method_value != null) {
                        fileRenameMethod = rename_method_value;
                    }
                }
                // Note which file this is for
                else if (metadata_element_name.equals("gsdlsourcefilename")) {
                    // The gsdlsourcefilename metadata field may be encoded by the encoding denoted
                    // in fileRenameMethod; it is decoded after the loop, once both are known
                    String candidate_value = extractValueUpToNextTag(line, name_index);

                    if (candidate_value == null) {
                        DebugStream.println("Warning: Could not read gsdlsourcefilename value.");
                    }
                    // We're only interested in paths containing the import folder. The file name and
                    // the line number of its enclosing element are recorded together so that they
                    // always belong to the same element.
                    else if (candidate_value.indexOf(IMPORT_DIR_NAME) != -1) {
                        gsdlsourcefilename_value = candidate_value;
                        description_element_start_gsdlsourcefilename_value = description_element_start;
                    }
                    // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or
                    // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file
                    // which are the gsdlsourcefilenames for the fedora digital object representing a collection.
                    // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example
                    else if (candidate_value.indexOf("tmp") == -1
                             && !candidate_value.endsWith("collect.cfg")
                             && !candidate_value.endsWith("collectionConfig.xml")) {
                        // We don't really know what is going on...
                        System.err.println("Warning: Could not understand gsdlsourcefilename " + candidate_value);
                    }
                }

                // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
                if (metadata_element_name.startsWith("gsdl")) {
                    continue;
                }

                MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name);
                if (metadata_element == null) {
                    // This element isn't defined in ex.mds, so create it for this session
                    DebugStream.println("Extracted metadata element not defined: " + metadata_element_name);
                    extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name);
                }
            }

            if (gsdlsourcefilename_value == null) {
                // No gsdlsourcefilename below the import folder was found: nothing to register
                DebugStream.println("No gsdlsourcefilename relative to the import folder found in " + this);
                return;
            }

            // We're only interested in the path relative to the import folder.
            // Lop off the "import" folder prefix; this also converts the separators
            // to those of the operating system that is running.
            gsdlsourcefilename_value = adjustForRelativeToImportDir(gsdlsourcefilename_value);

            // Now that we're done skimming, we actually need to decode gsdlsourcefilename
            // based on whatever fileRenameMethod was used to encode it, so that we can
            // at last properly compare against filenames on the file system
            // in order to load the correct ex.meta for the file.
            if (fileRenameMethod == null) {
                fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
            }
            if (!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
                // The path now uses the separators of the running OS (see above), so that is
                // what the decoder must split on, whatever separators the file originally used
                gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, !isWin);
            }

            // Now we can finally put the gsdlsourcefilename path relative to import dir into the hashmap
            ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value);
            if (description_elements_list == null) {
                description_elements_list = new ArrayList();
                source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, description_elements_list);
            }
            description_elements_list.add(Integer.valueOf(description_element_start_gsdlsourcefilename_value));

            // Next, if Windows, check if dealing with Win 8.3 Short Filename
            // In that case, convert short file name to long filename - works only if the file exists
            if (isWin) {
                // gsdlsourcefilename may contain Win 8.3 shortening. To get the Win long filename, prefix
                // the current collection's import dir and, if the resulting file exists, use
                // getCanonicalPath() which produces the Win long filename.
                File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory();
                File f = new File(currentCollectionFolder, IMPORT_DIR_NAME + File.separator + gsdlsourcefilename_value); // should work even if linux style slashes in gsdlsourcefilename_value

                // If the file doesn't exist there is no other form of the name to register.
                // (The already relative name must not be stripped at "import" a second time.)
                if (f.exists()) {
                    // Again, we're only interested in the path relative to the import folder
                    String long_gsdlsourcefilename = adjustForRelativeToImportDir(f.getCanonicalPath());
                    if (!gsdlsourcefilename_value.equals(long_gsdlsourcefilename)) { // truly distinct Win long and short file names
                        // Register the same list under the long filename as well
                        source_file_name_to_description_elements_mapping.put(long_gsdlsourcefilename, description_elements_list);
                    }
                }
            }
        }
        catch (FileNotFoundException exception) {
            DebugStream.printStackTrace(exception);
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
        catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
            DebugStream.printStackTrace(exception);
        }
        finally {
            closeQuietly(buffered_reader);
        }
    }

    /**
     * Closes the reader if there is one, reporting (not propagating) any failure.
     */
    private static void closeQuietly(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        }
        catch (IOException exception) {
            DebugStream.printStackTrace(exception);
        }
    }

    /**
     * Finds where the value of the name attribute starts on the line.
     *
     * @return the index of the first character of the attribute value, or -1
     *         if the line has no name attribute or its closing quote is missing
     */
    private static int findNameAttributeValueStart(String line) {
        int attribute_index = line.indexOf(NAME_ATTRIBUTE_PREFIX);
        if (attribute_index == -1) {
            return -1;
        }
        int name_index = attribute_index + NAME_ATTRIBUTE_PREFIX.length();
        return (line.indexOf("\"", name_index) == -1) ? -1 : name_index;
    }

    /**
     * Returns the text between the first ">" found at or after from_index and
     * the next "<" on the line.
     *
     * @return the text, or null if either delimiter is missing
     */
    private static String extractValueUpToNextTag(String line, int from_index) {
        int opening_tag_end = line.indexOf(">", from_index);
        if (opening_tag_end == -1) {
            return null;
        }
        int value_index = opening_tag_end + ">".length();
        int value_end = line.indexOf("<", value_index);
        if (value_end == -1) {
            return null;
        }
        return line.substring(value_index, value_end);
    }

    /**
     * Makes a path relative to the import folder and converts its separators
     * to those of the operating system that is running.
     *
     * Everything up to and including the first occurrence of "import" and the
     * one character following it (the separator) is removed. The path is taken
     * to be a Unix path if that character is "/".
     *
     * @return the adjusted path, or the argument unchanged if it doesn't contain "import"
     */
    private String adjustForRelativeToImportDir(String gsdlsourcefilename_value) {
        int import_index = gsdlsourcefilename_value.indexOf(IMPORT_DIR_NAME);
        if (import_index == -1) {
            return gsdlsourcefilename_value;
        }

        String relative_path = gsdlsourcefilename_value.substring(import_index + IMPORT_DIR_NAME.length());
        boolean is_unix_path = relative_path.startsWith("/");
        // Drop the separator following "import", if anything follows it at all
        if (relative_path.length() > 0) {
            relative_path = relative_path.substring(1);
        }

        // Make sure the path matches the OS that is running
        if (is_unix_path && isWin) {
            // Convert path from Unix to Windows
            relative_path = relative_path.replace('/', '\\');
        }
        else if (!is_unix_path && !isWin) {
            // Convert path from Windows to Unix
            relative_path = relative_path.replace('\\', '/');
        }
        return relative_path;
    }

    /**
     * Decodes an encoded source file path, one path component at a time.
     *
     * Everything from the first full-stop onwards is treated as the file
     * extension: it is left undecoded and appended to the result as is.
     * Decoded text is interpreted using the system's file.encoding property.
     *
     * @param relative_sourcefile_path the encoded path
     * @param encodingMethod FILE_RENAME_METHOD_URL for URL decoding; any other
     *        value is treated as Base64
     * @param is_unix_path true if the components of the path are separated by
     *        "/", false if they are separated by "\"
     * @return the decoded path, using the same separator as the input
     * @throws Exception if a component cannot be decoded
     */
    protected String decodeSourceFilename(String relative_sourcefile_path,
                                          String encodingMethod, boolean is_unix_path)
        throws Exception
    {
        // First get the file extension. Both in Base64 and URL encoded strings,
        // the full-stop character (.) doesn't get encoded.
        // That means getting the file extension is straightforward.

        // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
        // 26 lowercase characters, 26 uppercase characters as well as the
        // Plus sign (+) and the Forward Slash (/).
        int fullstop = relative_sourcefile_path.indexOf(".");
        String file_ext = "";
        if (fullstop != -1) {
            file_ext = relative_sourcefile_path.substring(fullstop);
            relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
        }

        String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
        String separator = is_unix_path ? "/" : "\\";
        String system_encoding = System.getProperty("file.encoding");
        boolean is_url_encoded = encodingMethod.equals(FILE_RENAME_METHOD_URL);

        StringBuilder decoded_gsdlsourcefilename = new StringBuilder();
        for (int i = 0; i < importFilePathParts.length; i++) {
            String decoded_filePathPart;
            if (is_url_encoded) {
                // URL decode each part of gsdlsourcefilename, using the default system encoding
                decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], system_encoding);
            }
            else { // treated as FILE_RENAME_METHOD_BASE64
                // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work,
                // using org.apache.commons.codec.binary.Base64 instead
                byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
                // Using system file.encoding to interpret the resulting bytestring as a String,
                // just as we always did with URL decoding method
                decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, system_encoding);
            }

            if (i > 0) {
                decoded_gsdlsourcefilename.append(separator);
            }
            decoded_gsdlsourcefilename.append(decoded_filePathPart);
        }

        // add the file extension back in
        decoded_gsdlsourcefilename.append(file_ext);

        return decoded_gsdlsourcefilename.toString();
    }

    /**
     * Given a filepath, returns the parts between each file separator as an array.
     * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
     * A path without any parts is returned as the only element of the array.
     */
    private static String[] getFilePathParts(String filepath, boolean is_unix_path) {
        StringTokenizer tok = new StringTokenizer(filepath, is_unix_path ? "/" : "\\");
        int count = tok.countTokens();
        if (count <= 0) {
            return new String[]{filepath};
        }

        String[] parts = new String[count];
        for (int i = 0; tok.hasMoreTokens(); i++) {
            parts[i] = tok.nextToken();
        }
        return parts;
    }

}
