/**********************************************************************
 *
 * gsdlunicode.h -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/


#ifndef GSDLUNICODE_H
#define GSDLUNICODE_H

#include "text_t.h"


// Converts the unicode encoded text_t characters lying between the
// iterators here and end into a utf-8 encoded text_t string.
text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);

// Convenience overload: converts a whole unicode encoded text_t
// string (from in.begin() to in.end()) to utf-8.
inline text_t to_utf8 (const text_t &in) {
  return to_utf8 (in.begin(), in.end());
}

// Converts a utf-8 encoded text_t string to a unicode
// encoded text_t string. The input is taken by const reference
// and the converted string is returned by value.
text_t to_uni (const text_t &in);

// Size, in bytes, of the utf8buf arrays declared in utf8inconvertclass
// and utf8outconvertclass.
// NOTE(review): presumably 3 because a 16 bit character never needs more
// than three utf-8 bytes -- confirm that text_t characters are 16 bits wide.
#define MAXUTF8CHARLEN 3

// Convert from a utf-8 char stream to the text_t class.
// The constructor, destructor, reset(), convert() and getutf8charlen()
// are only declared here; their definitions live outside this header.
class utf8inconvertclass : public inconvertclass {
public:
  utf8inconvertclass();
  virtual ~utf8inconvertclass();

  // NOTE(review): presumably reset() discards any partially buffered
  // character held in utf8buf -- confirm against the implementation.
  virtual void reset ();
  // output and status are non-const references, so both are available
  // to convert() as out-parameters.
  virtual void convert (text_t &output, status_t &status);

protected:
  // buffer to hold unconverted characters in a stream
  // (at most MAXUTF8CHARLEN bytes), and the number of bytes in use
  unsigned char utf8buf[MAXUTF8CHARLEN];
  size_t utf8buflen;

  // returns the length that the current contents of the
  // utf8buf should be
  size_t getutf8charlen ();
};


// Base class for output converters that can optionally remove
// zero width spaces (U+200B) from their output. The option is
// turned off by default. This class only stores the flag and
// provides the framework; the actual removal is implemented by
// the sub-classes.
//
// Note: by convention reset() should not reset the rzws flag.
class rzwsoutconvertclass : public outconvertclass {
public:
  // removal of zero width spaces starts out disabled (rzws == 0)
  rzwsoutconvertclass () : rzws (0) {}
  virtual ~rzwsoutconvertclass () {}

  // stores new_rzws as the current value of the rzws flag
  void set_rzws (int new_rzws) {
    rzws = new_rzws;
  }

protected:
  // the "remove zero width spaces" flag; 0 means the option is off
  int rzws;
};

// Output converter for utf16. utf16 is almost the same as unicode,
// except for unicode values > 65535.
class utf16outconvertclass : public rzwsoutconvertclass {
public:
  utf16outconvertclass () {}
  virtual ~utf16outconvertclass () {}

  // NOTE(review): presumably writes at most maxlen chars to out and
  // reports the number written in len -- confirm against the
  // implementation and the outconvertclass base class.
  virtual void convert (char *out, size_t maxlen,
                        size_t &len, status_t &status);
};


// Output converter: turns a text_t class into a utf-8 char stream.
class utf8outconvertclass : public rzwsoutconvertclass {
public:
  // starts with an empty utf8buf: both the length and the
  // current position are zero
  utf8outconvertclass () : utf8buflen (0), utf8bufhere (0) {}
  virtual ~utf8outconvertclass () {}

  virtual void reset ();

  // note that convert does not null-terminate the
  // output array of characters
  void convert (char *output, size_t maxlen,
                size_t &len, status_t &status);

protected:
  // byte buffer of up to MAXUTF8CHARLEN bytes, together with its
  // length (utf8buflen) and a position within it (utf8bufhere)
  unsigned char utf8buf[MAXUTF8CHARLEN];
  size_t utf8buflen;
  size_t utf8bufhere;
};


// mapdata_t is used by mapconvert to hold the map file data.
// The constructor is defined outside this header.
class mapdata_t {
public:
  mapdata_t();
  // NOTE(review): presumably true once map file data has been
  // loaded into ptrs -- confirm against loadmapfile/unloadmapfile.
  bool loaded;
  // table of 256 pointers to unsigned short data
  // NOTE(review): presumably indexed by the high byte of a 16 bit
  // character, each entry pointing at a sub-table for the low
  // byte -- confirm against the implementation.
  unsigned short *ptrs[256];
};

// mapconvert is used in situations where conversion is best
// done using a map file. The mapfile should reside in
// gsdlhome/unicode.
//
// NOTE(review): the destructor calls unloadmapfile() but no copy
// constructor or assignment operator is declared. If unloadmapfile()
// frees memory reachable from mapdata, copying a mapconvert would be
// unsafe -- confirm against the implementation.
class mapconvert {
public:
  mapconvert ();

  // calls unloadmapfile() when the object is destroyed
  ~mapconvert () {
    unloadmapfile ();
  }

  // setmapfile will cause loadmapfile to be called when conversion is
  // needed
  bool setmapfile (const text_t &themapfile, unsigned short theabsentc);

  // loadmapfile should be called before any conversion is done
  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
  void unloadmapfile ();

  // converts a single character value
  unsigned short convert (unsigned short c);

  // converts a whole string, returning the result.
  // note that this version of convert has different semantics to
  // the convertclass version.
  text_t convert (const text_t &instr);

protected:
  text_t mapfile;
  // NOTE(review): presumably the value produced for characters
  // that are absent from the map -- confirm.
  unsigned short absentc;
  mapdata_t mapdata;
};



// Size, in bytes, of the mapbuf arrays declared in mapinconvertclass
// and mapoutconvertclass. mapinconvertclass::getmapcharlen() never
// reports a character length greater than 2.
#define MAXMAPCHARLEN 2

// Input converter: turns a char stream in a map file based
// encoding (such as gb) into the unicode text_t class.
class mapinconvertclass : public inconvertclass {
public:
  mapinconvertclass();
  virtual ~mapinconvertclass() {}

  // setmapfile will cause loadmapfile to be called when conversion is
  // needed; the call is simply forwarded to the embedded converter
  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
    return converter.setmapfile (themapfile, theabsentc);
  }

  // loadmapfile should be called before any conversion takes
  // place; the call is simply forwarded to the embedded converter
  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
    return converter.loadmapfile (themapfile, theabsentc);
  }

  // a non-zero value makes getmapcharlen() treat a leading byte
  // of 0x80 or above as the start of a two byte character
  void set_multibyte (int new_multibyte) {
    m_multibyte = new_multibyte;
  }

  void reset ();
  void convert (text_t &output, status_t &status);

protected:
  // buffer to hold unconverted characters in a stream
  unsigned char mapbuf[MAXMAPCHARLEN];
  size_t mapbuflen;
  int m_multibyte;

  // note: multiple instances of mapinconvert class are expensive
  // as each will have its own copy of the map file data. This
  // could be reduced by sharing the map data statically, but then
  // it wouldn't be thread safe.
  mapconvert converter;

  // returns the length that the current contents of the
  // mapbuf should be: 0 when the buffer is empty, 2 when in
  // multibyte mode and the first byte is 0x80 or above, and
  // 1 otherwise
  inline size_t getmapcharlen () {
    if (mapbuflen == 0) return 0;

    const bool twobytes = m_multibyte && mapbuf[0] >= 0x80;
    return twobytes ? 2 : 1;
  }
};


// Output converter: turns a text_t class into a map char stream.
class mapoutconvertclass : public rzwsoutconvertclass {
public:
  mapoutconvertclass ();
  virtual ~mapoutconvertclass() {}

  // setmapfile will cause loadmapfile to be called when conversion is
  // needed; the call is simply forwarded to the embedded converter
  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
    return converter.setmapfile (themapfile, theabsentc);
  }

  // loadmapfile should be called before any conversion takes
  // place; the call is simply forwarded to the embedded converter
  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
    return converter.loadmapfile (themapfile, theabsentc);
  }

  // stores new_multibyte as the current value of m_multibyte
  void set_multibyte (int new_multibyte) {
    m_multibyte = new_multibyte;
  }

  void reset ();
  void convert (char *output, size_t maxlen,
                size_t &len, status_t &status);

protected:
  // byte buffer of up to MAXMAPCHARLEN bytes, together with its
  // length (mapbuflen) and a position within it (mapbufhere)
  unsigned char mapbuf[MAXMAPCHARLEN];
  size_t mapbuflen;
  size_t mapbufhere;
  int m_multibyte;

  // each instance owns its own mapconvert object
  mapconvert converter;
};


// Simple input and output converter classes for use with 8 bit encodings
// using simple textual map files. Map files should contain (at least) two
// tab-separated fields. The first field is the mapped value and the second
// field is the unicode value.

// "Less than" comparison functor for unsigned shorts: returns true
// exactly when t1 is strictly smaller than t2.
struct ltus_t
{
  bool operator()(const unsigned short &t1, const unsigned short &t2) const
  {
    return t2 > t1;
  }
};


// Character converter backed by a simple textual map file.
class simplemapconvert {
public:
  // starts out with no map loaded and an absentc of 0
  simplemapconvert () : loaded (false), absentc (0) {}

  // NOTE(review): presumably the in flag selects the direction of the
  // conversion (into unicode versus out of it) -- confirm against the
  // implementation.
  unsigned short convert (unsigned short c, bool in);

  // only records the name of the map file; nothing is loaded here
  void setmapfile (const text_t &themapfile) {
    mapfile = themapfile;
  }

protected:
  bool loadmapfile (bool in);

  // character mapping, ordered by ltus_t
  map <unsigned short, unsigned short, ltus_t> mapping;
  bool loaded;
  text_t mapfile;
  unsigned short absentc;
};


// Input converter that uses a simplemapconvert object.
class simplemapinconvertclass : public inconvertclass {
public:
  virtual ~simplemapinconvertclass () {}

  void convert (text_t &output, status_t &status);

  // hands the map file name on to the embedded converter
  void setmapfile (const text_t &themapfile) {
    converter.setmapfile (themapfile);
  }

protected:
  simplemapconvert converter;
};

// Output converter that uses a simplemapconvert object.
class simplemapoutconvertclass : public rzwsoutconvertclass {
public:
  virtual ~simplemapoutconvertclass () {}

  void convert (char *output, size_t maxlen,
                size_t &len, status_t &status);

  // hands the map file name on to the embedded converter
  void setmapfile (const text_t &themapfile) {
    converter.setmapfile (themapfile);
  }

protected:
  simplemapconvert converter;
};




#endif
