/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.util;

import java.util.*;

/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters).
 * <p>
 * Each named transform is an ordered list of (regular expression, replacement) pairs which
 * {@link #transform(String, String)} applies one after the other with {@link String#replaceAll(String, String)}.
 * Because every pair operates on the output of the previous one, the order of the pairs within a
 * transform matters: escaping transforms handle the escape character itself first, unescaping
 * transforms handle it last.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    static final public String DECODE_PATH = "DECODE_PATH";
    static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
    static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
    static final public String ENCODE_PATH = "ENCODE_PATH";
    static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
    static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
    static final public String REINSTATE_HTML_TAGS = "REINSTATE_HTML_TAGS";
    static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
    static final public String TEXT_TO_DOM_PRESERVE_TAGS = "TEXT_TO_DOM_PRESERVE_TAGS";
    static final public String TEXT_TO_DOM_PRESERVE_TAGS_GS3 = "TEXT_TO_DOM_PRESERVE_TAGS_GS3";
    static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
    static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** The number of cached results after which the cache is emptied and started again. */
    static final private int MAX_CACHE_SIZE = 100;

    /** Transform name -> flat array of { regexp, replacement, regexp, replacement, ... }. */
    static final private Map<String, String[]> TRANSFORMS = new HashMap<String, String[]>();
    /** Transform name -> (raw string -> processed string). Guarded by synchronizing on the map itself. */
    static final private Map<String, Map<String, String>> CACHE = new HashMap<String, Map<String, String>>();
    /** Total number of processed strings currently held in CACHE, across all transforms. */
    static private int cache_entries = 0;

    /** Static initialiser which constructs the TRANSFORMS mappings. */
    static {
        TRANSFORMS.put(DECODE_PATH, new String[] {
            "\\|",    "\\\\",
            "&#124;", "\\|"
        });

        // Transform text with encoded square brackets back into text with [ and ]
        TRANSFORMS.put(DECODE_SQUARE_BRACKETS, new String[] {
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Translate DOM encoded text into Greenstone encoding
        //  removed "\n", "\\\\n", - config files are allowed new lines
        // added "\\|", "\\\\"
        TRANSFORMS.put(DOM_TO_GREENSTONE, new String[] {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text
        TRANSFORMS.put(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        });

        // Transform text into a regular expression that will match it.
        // The backslash must stay first so that the backslashes added by the later pairs are not doubled.
        TRANSFORMS.put(TEXT_TO_REGEXP, new String[] {
            "\\\\", "\\\\\\\\",
            "\\(", "\\\\(",
            "\\)", "\\\\)",
            "\\[", "\\\\[",
            "\\]", "\\\\]",
            "\\{", "\\\\{",
            "\\}", "\\\\}",
            "\\.", "\\\\.",
            // Quantifiers, anchors and alternation are metacharacters too and must be escaped
            "\\*", "\\\\*",
            "\\+", "\\\\+",
            "\\?", "\\\\?",
            "\\^", "\\\\^",
            "\\$", "\\\\\\$",
            "\\|", "\\\\|"
        });

        TRANSFORMS.put(ENCODE_PATH, new String[] {
            "\\|",  "&#124;",
            "\\\\", "\\|"
        });

        // Transform text into text, but without [ and ]
        TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;"
        });

        // Transform Greenstone encoded text to DOM encoding
        // removed"\\\\n", "\n", added "\\\\", "\\|"
        TRANSFORMS.put(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform Greenstone encoded text to plain text
        // removed    "\\\\n", "\n", "\\|", "\\\\"
        TRANSFORMS.put(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Transform plain html text into something that can be placed in a DOM
        TRANSFORMS.put(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Same as above, but preserve html element tags (and leave apostrophes alone)
        TRANSFORMS.put(TEXT_TO_DOM_PRESERVE_TAGS, new String[] {
            "&", "&amp;",
            "\"", "&quot;"
        });

        // Same as above, but don't escape quotes
        TRANSFORMS.put(TEXT_TO_DOM_PRESERVE_TAGS_GS3, new String[] {
            "&", "&amp;"
        });

        // Unescape html (or xml) text. The ampersand entity is handled last so that an escaped
        // entity such as "&amp;lt;" is unescaped exactly once (to "&lt;") rather than twice (to "<").
        // Apostrophe entities are deliberately left alone.
        TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, new String[] {
            "&lt;", "<",
            "&gt;", ">",
            "&quot;", "\"",
            "&amp;", "&"
        });

        // Reinstate tag markers <>
        TRANSFORMS.put(REINSTATE_HTML_TAGS, new String[] {
            "&lt;", "<",
            "&gt;", ">"
        });

        // Transform plain html text into greenstone encoding
        // 	    "\'", "&apos;",
        // removed "\\\\", "\\|",
        TRANSFORMS.put(TEXT_TO_GREENSTONE, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command
        TRANSFORMS.put(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
        TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /** Convert a string from one format to another using one of the named transforms.
     * Results are remembered in a small cache; once the cache holds MAX_CACHE_SIZE results it is
     * emptied and started again.
     * @param raw the string to convert, may be null
     * @param transform the name of the transform to apply, normally one of the constants of this class
     * @return the converted string, null if raw was null, or raw unchanged if no transform of that name is registered
     */
    static public String transform(String raw, String transform) {
        if (raw == null) {
            return raw;
        }
        String processed = lookupCached(transform, raw);
        if (processed != null) {
            return processed;
        }
        processed = raw;
        String[] pairs = TRANSFORMS.get(transform);
        if (pairs != null) {
            // Apply each (regexp, replacement) pair, in order, to the output of the previous one
            for (int i = 0; i + 1 < pairs.length; i += 2) {
                processed = processed.replaceAll(pairs[i], pairs[i + 1]);
            }
        }
        storeCached(transform, raw, processed);
        return processed;
    }

    /** Fetch a previously computed result from the cache.
     * @return the cached result, or null if there is none
     */
    static private String lookupCached(String transform, String raw) {
        synchronized (CACHE) {
            Map<String, String> results = CACHE.get(transform);
            return (results == null) ? null : results.get(raw);
        }
    }

    /** Remember a computed result, first emptying the cache if it has reached its maximum size. */
    static private void storeCached(String transform, String raw, String processed) {
        synchronized (CACHE) {
            // If cache is at maximum size, empty it and start again
            if (cache_entries >= MAX_CACHE_SIZE) {
                CACHE.clear();
                cache_entries = 0;
            }
            Map<String, String> results = CACHE.get(transform);
            if (results == null) {
                results = new HashMap<String, String>();
                CACHE.put(transform, results);
            }
            // Only count the entry if it was not already present (another thread may have stored it meanwhile)
            if (results.put(raw, processed) == null) {
                cache_entries++;
            }
        }
    }

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';

    /** Transform either of the accepted unicode escape sequence styles found in the string into single characters.
     * The two styles are the HTML numeric character reference (ampersand, hash, a decimal number or
     * an 'x' followed by a hexadecimal number, semicolon) and the Java style escape (backslash, 'u' or 'U',
     * exactly four hexadecimal digits). Anything that merely resembles one of these but is not well
     * formed is copied to the output unchanged.
     * @param raw the string to process, may be null
     * @return the string with every well formed escape sequence replaced by the character it names, or null if raw was null
     */
    static public String transformUnicode(String raw) {
        if (raw == null) {
            return null;
        }
        int raw_length = raw.length();
        StringBuilder processed = new StringBuilder(raw_length);
        int index = 0;
        while (index < raw_length) {
            char c0 = raw.charAt(index);
            int consumed = 0;
            if (c0 == AND_CHAR) {
                consumed = appendNumericReference(raw, index, processed);
            }
            else if (c0 == ESCAPE_CHAR) {
                consumed = appendUnicodeEscape(raw, index, processed);
            }
            if (consumed == 0) {
                // Not the start of a well formed escape sequence, so keep the character as it is
                processed.append(c0);
                consumed = 1;
            }
            index += consumed;
        }
        return processed.toString();
    }

    /** Try to decode an HTML numeric character reference beginning at the ampersand at position start.
     * @return the number of characters of raw that were consumed, or 0 if nothing was decoded (nothing is appended in that case)
     */
    static private int appendNumericReference(String raw, int start, StringBuilder out) {
        if (start + 1 >= raw.length() || raw.charAt(start + 1) != HASH_CHAR) {
            return 0;
        }
        int end = raw.indexOf(SEMICOLON_CHAR, start + 2);
        if (end == -1) {
            // Ran out of characters before finding the terminating semicolon
            return 0;
        }
        String digits = raw.substring(start + 2, end);
        int radix = 10;
        if (digits.length() > 0 && (digits.charAt(0) == 'x' || digits.charAt(0) == 'X')) {
            radix = 16;
            digits = digits.substring(1);
        }
        int code_point;
        try {
            code_point = Integer.parseInt(digits, radix);
        }
        catch (NumberFormatException not_a_number) {
            // e.g. "AT&#T;" - an ampersand and hash which do not introduce a number
            return 0;
        }
        if (!Character.isValidCodePoint(code_point)) {
            return 0;
        }
        // appendCodePoint emits a surrogate pair for characters beyond the basic multilingual plane
        out.appendCodePoint(code_point);
        return end - start + 1;
    }

    /** Try to decode a backslash-u style escape (exactly four hexadecimal digits) beginning at the backslash at position start.
     * @return the number of characters of raw that were consumed, or 0 if nothing was decoded (nothing is appended in that case)
     */
    static private int appendUnicodeEscape(String raw, int start, StringBuilder out) {
        if (start + 5 >= raw.length()) {
            return 0;
        }
        char marker = raw.charAt(start + 1);
        if (marker != LOWER_U_CHAR && marker != UPPER_U_CHAR) {
            return 0;
        }
        // We read four digits
        int value = 0;
        for (int i = start + 2; i < start + 6; i++) {
            int digit = Character.digit(raw.charAt(i), 16);
            if (digit < 0) {
                return 0;
            }
            value = (value << 4) | digit;
        }
        out.append((char) value);
        return 6;
    }

    /** Command line entry point. With fewer than two arguments a built-in test suite is run and
     * its results are printed to standard error; otherwise the first argument is transformed by
     * the transform named in the second argument and the result is printed.
     * @param args either nothing, or the raw string followed by the name of the transform
     */
    static public void main(String[] args) {
        if (args.length < 2) {
            String raw;
            String transform;

            System.err.println("Running Test Suite");

            transform = "DOM_TO_GREENSTONE";
            System.err.println("Test " + transform);
            raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
            report(raw, transform);

            transform = "DOM_TO_TEXT";
            System.err.println("Test " + transform);
            raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
            report(raw, transform);

            transform = "GREENSTONE_TO_DOM";
            System.err.println("Test " + transform);
            raw = "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
            report(raw, transform);

            transform = "GREENSTONE_TO_TEXT";
            System.err.println("Test " + transform);
            raw = "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.";
            report(raw, transform);

            transform = "TEXT_TO_DOM";
            System.err.println("Test " + transform);
            raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
            report(raw, transform);

            transform = "TEXT_TO_GREENSTONE";
            System.err.println("Test " + transform);
            raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
            report(raw, transform);

            transform = "TEXT_TO_SHELL";
            System.err.println("Test " + transform);
            // The platform is detected from the os.name system property so that this self test
            // needs nothing outside the JDK
            if (System.getProperty("os.name", "").startsWith("Windows")) {
                System.err.println("[Windows Version]");
                transform = "TEXT_TO_SHELL_WINDOWS";
            }
            else {
                System.err.println("[Unix Version]");
                transform = "TEXT_TO_SHELL_UNIX";
            }
            raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
            report(raw, transform);

            System.err.println("***** UNICODE TEST *****");
            System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
            System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
            System.err.println("\\u007a => " + transformUnicode("\\u007a"));
            System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
            System.err.println("&#48;   => " + transformUnicode("&#48;"));
            System.err.println("&#65;   => " + transformUnicode("&#65;"));
            System.err.println("&#122;  => " + transformUnicode("&#122;"));
            System.err.println("&#231;  => " + transformUnicode("&#231;"));
        }
        else {
            System.err.println("Raw:       '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            String processed = transform(args[0], args[1]);
            System.err.println("Processed: '" + processed + "'");
        }
    }

    /** Print a raw string and the result of applying the named transform to it, for the test suite. */
    static private void report(String raw, String transform) {
        System.err.println("Raw:       '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transform) + "'");
    }
}
