/**********************************************************************
 *
 * gsdlunicode.cpp -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "gsdlunicode.h"


// unitool is currently in mg, if mg is not being used it should
// be moved into GSDLHOME/lib 
// A copy of mgpp's unitool has now been moved into common-src/src/lib/
#include "unitool.h"

#include "fileutil.h"

#include <stdio.h>

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\iostream>
#  include <ospace\std\fstream>
#elif defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#  include <fstream.h>
#else
#  include <iostream>
#  include <fstream>
#endif


// Encodes the unicode characters in the range [here, end) as utf-8
// and returns them as a text_t holding one utf-8 byte per element.
text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
  text_t encoded;
  unsigned char bytes[MAXUTF8CHARLEN];

  for (; here != end; ++here) {
    // the return value is used as the number of bytes placed in bytes[]
    int nbytes = output_utf8_char (*here, bytes, &bytes[MAXUTF8CHARLEN-1]);
    for (int b = 0; b < nbytes; ++b) encoded.push_back(bytes[b]);
  }

  return encoded;
}

// Converts a utf-8 encoded text_t string (one byte per element) to a
// unicode encoded text_t string.
//
// The input is flattened with getcstr(); that buffer is released with
// delete[] before returning. Decoding stops at the first position for
// which parse_utf8_char does not return a positive length.
text_t to_uni (const text_t &in) {
  text_t out;
  unsigned char *in_cstr = (unsigned char *)in.getcstr();
  if (in_cstr == NULL) return out;

  // find the terminating null
  unsigned char *end = in_cstr;
  while (*end != '\0') ++end;

  // An empty string has no last valid character; stepping end back
  // would form a pointer before the start of the buffer.
  if (end != in_cstr) {
    // parse_utf8_char is handed the last valid byte, not one past it
    --end;

    unsigned char *here = in_cstr;
    unsigned short unichar;
    int charlen = 0;

    while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
      out.push_back(unichar);
      here += charlen;
    }
  }

  delete []in_cstr;

  return out;
}


// Writes the pending input text to out as big endian utf-16, two bytes
// per character. This works for all unicode values < 65536...
//
// out    - destination buffer
// maxlen - capacity of out in bytes; an odd value is rounded down
// len    - set to the number of bytes written (0 when nothing is written)
// status - finished once all of the input has been consumed (or there
//          is no input/output buffer), unfinished otherwise
void utf16outconvertclass::convert (char *out, size_t maxlen, size_t &len, status_t &status) {
  // always report a defined length, even on the early exit below
  len = 0;

  // we should already have text_t* input set...
  if (input == NULL || out == NULL)
  {
    status = finished;
    return;
  }

  text_t::iterator textend = input->end();
  if (maxlen % 2) --maxlen; // we need an even number of output bytes...
  while ((len < maxlen) && (texthere != textend)) {
    unsigned short int uni_char=(unsigned short int) *texthere;
    // big endian utf-16: high byte first. For values below 256 the
    // high byte is simply 0, so no special case is needed.
    out[len]=(char)(uni_char >> 8);
    out[len+1]=(char)(uni_char & 255);
    len+=2;
    ++texthere;
  }
  if (texthere==textend)
    status=finished;
  else
    status=unfinished;
}


// A new converter starts with no bytes held in its utf-8 buffer.
utf8inconvertclass::utf8inconvertclass () {
  this->utf8buflen = 0;
}

utf8inconvertclass::~utf8inconvertclass () {
  // nothing to do: this destructor releases no resources itself
}

// Forgets the current input buffer and any bytes held in utf8buf.
void utf8inconvertclass::reset () {
  utf8buflen = 0;
  len = 0;
  start = NULL;
}

// Converts the utf-8 bytes in [start, start+len) to unicode, replacing
// the contents of output.
//
// Bytes of a character that has not been completed yet are kept in
// utf8buf between calls. On return start points past the consumed
// input and len is 0. status is set to finished when utf8buf is empty
// on return, and to stopped when it still holds a partial character.
void utf8inconvertclass::convert (text_t &output, status_t &status) {
  output.clear();
  output.reserve (len/3);
  
  if (start == NULL || len == 0) {
    if (utf8buflen == 0) status = finished;
    else status = stopped;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *here = (unsigned char *)start;
  unsigned char *end = here+len-1;
  unsigned short c;
  size_t realcharlen;

  size_t charlen = getutf8charlen ();
  while (len > 0) {
    if (charlen == 0) {
      // start parsing a new character
      utf8buflen = 0;

      // fast common case: while more than 3 bytes remain, parse
      // directly from the input without going through utf8buf
      while (len > 3) {
        realcharlen = parse_utf8_char (here, end, &c);
        output.push_back (c);
        here += realcharlen;
        len -= realcharlen;
      }

      // The fast loop may have consumed exactly what was left; reading
      // *here would then be past the end of the input and --len would
      // wrap around.
      if (len == 0) break;

      utf8buf[utf8buflen++] = *here;
      ++here;
      --len;
      charlen = getutf8charlen ();

    } else if (utf8buflen < charlen) {
      // assumes charlen is always less than MAXUTF8CHARLEN
      utf8buf[utf8buflen++] = *here;
      ++here;
      --len;
    }

    if (utf8buflen == charlen) {
      // got a complete character
      realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
      output.push_back (c);
      
      // move any unparsed characters. If an error occurred some of
      // the characters might be unused. The unparsed bytes start at
      // index realcharlen and there are diff of them.
      int i;
      int diff = utf8buflen - realcharlen;
      for (i=0; i < diff; ++i) utf8buf[i] = utf8buf[i+realcharlen];
      utf8buflen = diff;
      charlen = getutf8charlen ();
    }
  }

  start = (char *)here; // save current position

  if (utf8buflen == 0) status = finished;
  else status = stopped;
}


// Returns the number of bytes the character currently held in utf8buf
// should occupy, judged from its first byte. Returns 0 when utf8buf
// is empty, and 1 for a first byte that cannot start a character of
// one to three bytes.
size_t utf8inconvertclass::getutf8charlen () {
  if (utf8buflen == 0) return 0;

  const unsigned char lead = utf8buf[0];

  // below 0x80: one byte character
  // 0x80 to 0xbf: error, is not the start of a utf-8 character
  if (lead < 0xc0) return 1;

  // 0xc0 to 0xdf: two byte character
  if (lead < 0xe0) return 2;

  // 0xe0 to 0xef: three byte character
  if (lead < 0xf0) return 3;

  // 0xf0 and above: error, character too long for unicode
  return 1;
}


// Drops the input text, the output stream pointer and anything still
// waiting in the internal utf-8 buffer.
void utf8outconvertclass::reset () {
  utf8bufhere = 0;
  utf8buflen = 0;
  outs = NULL;
  input = NULL;
}

// Writes the pending input text to output as utf-8.
//
// note that convert does not null-terminate the
// output array of characters
//
// output - destination buffer
// maxlen - capacity of output in bytes
// len    - set to the number of bytes written (0 when nothing is written)
// status - finished when the input is exhausted and the internal
//          buffer is empty, unfinished otherwise
void utf8outconvertclass::convert (char *output, size_t maxlen, 
				   size_t &len, status_t &status) {
  // always report a defined length, even on the early exit below
  len = 0;

  if (input == NULL || output == NULL) {
    if (utf8buflen == 0) status = finished;
    else status = unfinished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  while (len < maxlen) {
    // empty the contents of the internal buffer
    if (utf8buflen > 0) {
      while (len < maxlen && utf8bufhere < utf8buflen) {
        *uoutput = utf8buf[utf8bufhere];
        ++uoutput;
        ++len;
        ++utf8bufhere;
      }

      if (utf8bufhere == utf8buflen) {
        utf8bufhere = 0;
        utf8buflen = 0;
      }
    }

    // fill up the buffer with the next character
    if (utf8buflen == 0) {
      if (texthere == textend) break; // finished!
      // when rzws is set the character 0x200b is skipped: nothing is
      // buffered for it and the loop moves on to the next character
      if (!rzws || (*texthere != 0x200b))
        utf8buflen = output_utf8_char (*texthere, utf8buf, 
                                       &utf8buf[MAXUTF8CHARLEN-1]);
      ++texthere;
      utf8bufhere = 0;
    }
  }
  
  if (texthere == textend && utf8buflen == 0) status = finished;
  else status = unfinished;
}






// Starts out empty: nothing loaded and all 256 block pointers NULL.
mapdata_t::mapdata_t () {
  // say nothing has been loaded
  loaded = false;

  // reset all the map ptrs to be NULL
  int block = 0;
  while (block < 256) {
    ptrs[block] = (unsigned short *)NULL;
    ++block;
  }
}


// The value returned for unmapped characters starts out as 0.
mapconvert::mapconvert () {
  this->absentc = 0;
}

// Records which map file and absent character to use without reading
// the file; loadmapfile is called later, when a conversion needs it.
// Any previously loaded map is unloaded unless the same file and
// absent character are already loaded. Always returns true.
bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
  const bool alreadyloaded =
    mapdata.loaded && mapfile == themapfile && absentc == theabsentc;

  if (!alreadyloaded) {
    unloadmapfile ();
    mapfile = themapfile;
    absentc = theabsentc;
  }

  return true;
}



// loadmapfile should be called before any conversion is done
bool mapconvert::loadmapfile (const text_t &themapfile,
			      unsigned short theabsentc) {
  FILE *mapfilein = (FILE *)NULL;

  // check to see if the mapfile has been already loaded
  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;

  unloadmapfile ();
  mapfile = themapfile;
  absentc = theabsentc;

  // open the map file
  char *cfilename = mapfile.getcstr();
  if (cfilename == (char *)NULL) return false;
  mapfilein = fopen(cfilename, "rb");
  delete []cfilename; cfilename = NULL;

  if (mapfilein == (FILE *)NULL) return false;

  unsigned char c, n1, n2;
  unsigned short *arrptr;
  int i;
  c = fgetc (mapfilein);
  while (!feof (mapfilein)) {
    if (mapdata.ptrs[c] == (unsigned short *)NULL) {
      // allocate a new array
      arrptr = new unsigned short[256];
      mapdata.ptrs[c] = arrptr;
    } else arrptr = mapdata.ptrs[c];

    // clear the array
    for (i=0; i<256; ++i) arrptr[i] = 0;
    
    // read in this block
    n1 = fgetc (mapfilein);
    n2 = fgetc (mapfilein);
    i=0;
    while (!feof (mapfilein)) {
      arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;

      ++i;
      if (i >= 256) break;
      n1 = fgetc (mapfilein);
      n2 = fgetc (mapfilein);
    }

    c = fgetc (mapfilein);
  }

  mapdata.loaded = true;

  return true;
}

// Frees every table held in mapdata and marks the map as not loaded.
// Does nothing when no map is loaded.
void mapconvert::unloadmapfile () {
  if (mapdata.loaded) {
    for (int block = 0; block < 256; ++block) {
      // delete[] on a null pointer has no effect
      delete [] mapdata.ptrs[block];
      mapdata.ptrs[block] = (unsigned short *)NULL;
    }

    mapdata.loaded = false;
  }
}


// Maps a single character through the loaded tables, loading the map
// file first if that has not happened yet. Returns absentc when the
// map cannot be loaded or holds no entry for c; 0 always maps to 0
// once the map is loaded.
unsigned short mapconvert::convert (unsigned short c) {
  if (!mapdata.loaded) {
    // try to load the map on demand
    if (mapfile.empty() || !loadmapfile (mapfile, absentc)) return absentc;
  }

  if (c == 0) return 0; // 0 always maps to 0...

  // the high byte selects the table, the low byte the entry in it
  unsigned short *table = mapdata.ptrs[c >> 8];
  if (table == (unsigned short *)NULL) return absentc;

  unsigned short mapped = table[c & 0xff];
  return (mapped == 0) ? absentc : mapped;
}

// Converts every character of instr with the single character
// convert() and returns the resulting string.
//
// If the map has not been loaded yet it is loaded here first, the same
// way the single character convert() does it. When no map file has
// been set, or loading it fails, absentc is returned converted to a
// text_t, which is what this function returned before whenever no map
// was loaded.
text_t mapconvert::convert (const text_t &instr) {
  if (!mapdata.loaded) {
    if (mapfile.empty() || !loadmapfile (mapfile, absentc)) return absentc;
  }

  text_t outstr;
  text_t::const_iterator here = instr.begin();
  text_t::const_iterator end = instr.end();

  while (here != end) {
    outstr.push_back(this->convert(*here));
    ++here;
  }
  
  return outstr;
}




// Starts with an empty character buffer and m_multibyte set to 0.
mapinconvertclass::mapinconvertclass () {
  this->mapbuflen = 0;
  this->m_multibyte = 0;
}

// Forgets the current input buffer and any bytes held in mapbuf.
void mapinconvertclass::reset () {
  mapbuflen = 0;
  len = 0;
  start = NULL;
}

// Converts the bytes in [start, start+len) to unicode through
// converter, replacing the contents of output.
//
// Bytes are gathered in mapbuf until getmapcharlen() says the
// character is complete. A complete one byte character below 0x80 is
// copied unchanged; anything else is passed through converter. status
// is finished when mapbuf is empty on return, stopped otherwise.
void mapinconvertclass::convert (text_t &output, status_t &status) {
  output.clear();

  if (start == NULL || len == 0) {
    status = (mapbuflen == 0) ? finished : stopped;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *cursor = (unsigned char *)start;

  size_t expected = getmapcharlen ();
  while (len > 0) {
    if (expected == 0) {
      // start parsing a new character
      mapbuflen = 0;
      mapbuf[mapbuflen++] = *cursor++;
      --len;
      expected = getmapcharlen ();

    } else if (mapbuflen < expected) {
      // assumes the expected length is always less than MAXMAPCHARLEN
      mapbuf[mapbuflen++] = *cursor++;
      --len;
    }

    // keep gathering bytes until the character is complete
    if (mapbuflen != expected) continue;

    if (expected != 1) {
      // two byte character
      output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
                                          (unsigned short)mapbuf[1]));
    } else if (mapbuf[0] < 0x80) {
      // ascii character
      output.push_back (mapbuf[0]);
    } else {
      output.push_back (converter.convert((unsigned short)mapbuf[0]));
    }

    mapbuflen = 0;
    expected = 0;
  }

  start = (char *)cursor; // save current position

  status = (mapbuflen == 0) ? finished : stopped;
}



// Starts with an empty output buffer and m_multibyte set to 0.
mapoutconvertclass::mapoutconvertclass () {
  this->mapbufhere = 0;
  this->mapbuflen = 0;
  this->m_multibyte = 0;
}

// Drops the input text, the output stream pointer and anything still
// waiting in the internal buffer.
void mapoutconvertclass::reset () {
  mapbufhere = 0;
  mapbuflen = 0;
  outs = NULL;
  input = NULL;
}

// Writes the pending input text to output, mapping characters of 0x80
// and above through converter. Characters below 0x80 are written as a
// single byte unchanged. Mapped characters are written as two bytes
// (high byte first) when m_multibyte is set, and as one byte otherwise.
//
// note that convert does not null-terminate the
// output array of characters
//
// output - destination buffer
// maxlen - capacity of output in bytes
// len    - set to the number of bytes written (0 when nothing is written)
// status - finished when the input is exhausted and the internal
//          buffer is empty, unfinished otherwise
void mapoutconvertclass::convert (char *output, size_t maxlen, 
				 size_t &len, status_t &status) {
  unsigned short outc;

  // always report a defined length, even on the early exit below
  len = 0;

  if (input == NULL || output == NULL) {
    if (mapbuflen == 0) status = finished;
    else status = unfinished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  while (len < maxlen) {
    // empty the contents of the internal buffer
    if (mapbuflen > 0) {
      while (len < maxlen && mapbufhere < mapbuflen) {
        *uoutput = mapbuf[mapbufhere];
        ++uoutput;
        ++len;
        ++mapbufhere;
      }

      if (mapbufhere == mapbuflen) {
        mapbufhere = 0;
        mapbuflen = 0;
      }
    }

    // fill up the buffer with the next character
    if (mapbuflen == 0) {
      if (texthere == textend) break; // finished!
      // when rzws is set the character 0x200b is skipped: nothing is
      // buffered for it and the loop moves on to the next character
      if (!rzws || (*texthere != 0x200b)) {
        if (*texthere < 0x80) {
          mapbuf[0] = (unsigned char)*texthere;
          mapbuflen = 1;
        } else {
          outc = converter.convert (*texthere);
          if (m_multibyte) {
            mapbuf[0] = (unsigned char)(outc >> 8);
            mapbuf[1] = (unsigned char)(outc & 0xff);
            mapbuflen = 2;
          } else {
            mapbuf[0] = outc;
            mapbuflen = 1;
          }
        }
      }

      ++texthere;
      mapbufhere = 0;
    }
  }
  
  if (texthere == textend && mapbuflen == 0) status = finished;
  else status = unfinished;
}


// Loads the text map file named by mapfile into mapping.
//
// Each useful line holds at least two tab separated fields that both
// start with "0x"; anything from a '#' to the end of the line is a
// comment. When in is true the first field maps to the second,
// otherwise the second maps to the first.
//
// Returns true if the map is (or already was) loaded, false when no
// map file is set or it cannot be opened.
bool simplemapconvert::loadmapfile (bool in) {
  if (loaded) return true;
  if (mapfile.empty()) return false;

  char *cfilename = mapfile.getcstr();
#ifdef GSDL_USE_IOS_H
  ifstream mapfilein (cfilename, ios::in | ios::nocreate);
#else
  ifstream mapfilein (cfilename, ios::in);
#endif
  delete []cfilename;
  if (!mapfilein) return false;

  char cline[2048];
  text_t line;

  // Loop on the result of getline rather than on eof(): a line that
  // does not fit in cline puts the stream into a failed state without
  // reaching end of file, and an eof() test would then never end.
  while (mapfilein.getline (cline, sizeof(cline))) {
    line.clear();
    line.appendcstr (cline);
    if (line.empty()) continue;
    // remove comments
    text_t::iterator end = line.end();
    text_t::iterator here = findchar (line.begin(), end, '#');
    if (here != end) {
      line.erase (here, end);
      if (line.empty()) continue;
    }
    
    text_tarray parts;
    splitchar (line.begin(), line.end(), '\t', parts);
    
    // do some simple sanity checks
    if (parts.size() < 2) continue;
    text_t::iterator begin1 = parts[0].begin();
    text_t::iterator begin2 = parts[1].begin();
    // both fields need at least two characters before the "0x"
    // prefix can be examined
    if ((parts[0].end() - begin1) < 2) continue;
    if ((parts[1].end() - begin2) < 2) continue;
    if (*begin1 != '0' || *(begin1+1) != 'x') continue;
    if (*begin2 != '0' || *(begin2+1) != 'x') continue;
    char *from = parts[0].getcstr();
    char *to = parts[1].getcstr();
    // %i stores through an int pointer
    int f = 0, t = 0;
    sscanf (from, "%i", &f);
    sscanf (to, "%i", &t);
    delete []from;
    delete []to;
    
    if (in) mapping[(unsigned short)f] = (unsigned short)t;
    else mapping[(unsigned short)t] = (unsigned short)f;
  }

  loaded = true;
  return true;
}

// Looks c up in mapping, loading the map file first if needed (in is
// passed on to loadmapfile). Returns absentc when the map cannot be
// loaded.
// NOTE(review): if mapping is an associative container, mapping[c]
// yields a default value for unmapped characters rather than absentc;
// confirm against the declaration.
unsigned short simplemapconvert::convert (unsigned short c, bool in) {
  if (!loaded && !loadmapfile(in)) return absentc;

  return mapping[c];
}


// Converts the bytes in [start, start+len) to unicode, replacing the
// contents of output. Bytes below 0x80 are copied unchanged, all
// others are mapped through converter. The whole input is consumed in
// one call, so status is always set to finished.
void simplemapinconvertclass::convert (text_t &output, status_t &status) {
  output.clear();

  if (start != NULL && len != 0) {
    // don't want any funny sign conversions happening
    unsigned char *cursor = (unsigned char *)start;

    for (; len > 0; --len, ++cursor) {
      const unsigned char byte = *cursor;
      if (byte >= 0x80) output.push_back (converter.convert(byte, true));
      else output.push_back (byte); // append this character
    }

    start = (char *)cursor; // save current position
  }

  status = finished;
}


// Writes the pending input text to output, one byte per character.
// Characters below 0x80 are written unchanged, all others are mapped
// through converter. The output is not null-terminated.
//
// output - destination buffer
// maxlen - capacity of output in bytes
// len    - set to the number of bytes written (0 when nothing is written)
// status - finished once all of the input has been consumed (or there
//          is no input/output buffer), unfinished otherwise
void simplemapoutconvertclass::convert (char *output, size_t maxlen, 
					size_t &len, status_t &status) {
  // always report a defined length, even on the early exit below
  len = 0;

  if (input == NULL || output == NULL) {
    status = finished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  while ((len < maxlen) && (texthere != textend)) {

    if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
    else *uoutput = converter.convert (*texthere, false);

    ++uoutput;
    ++len;
    ++texthere;
  }
  
  if (texthere == textend) status = finished;
  else status = unfinished;
}
