/**********************************************************************
 *
 * mgsearch.cpp -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "gsdlconf.h"
#include "mgsearch.h"
#include "fileutil.h"

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\iostream>
#elif defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#else
#  include <iostream>
#endif

  
#include <assert.h>

#include "mgq.h"
// #include "locateinfo.h"
#include "gsdlunicode.h"
#include "unitool.h"


/////////////
// globals //
/////////////

// Hand-off slots between doctextcallback, which fills them, and
// mgsearchclass::mgdocument, which resets them before a retrieval and
// passes their values on to its caller afterwards.
// tempdoc: heap copy (new[]) of the retrieved document text, or NULL
static char *tempdoc = NULL;
// templen: the length that was reported together with that document
static int templen = 0;


//////////////////////
// useful functions //
//////////////////////


// Stems one word through mgq_stemword and returns the result.
// input and output are in utf8
text_t mgsearch_stemword (const text_t &word) {
  // the working buffer is length-prefixed: byte 0 holds the length, the
  // characters follow, and one extra byte is kept for a terminator
  const int stemlimit = mgq_getmaxstemlen ();
  unsigned char *stembuf = new unsigned char [stemlimit + 2];
  if (stembuf == NULL) return "";

  // copy at most stemlimit characters in behind the length byte
  int used = 0;
  for (text_t::const_iterator it = word.begin(), stop = word.end();
       used < stemlimit && it != stop; ++it, ++used) {
    stembuf[used + 1] = (unsigned char)(*it);
  }
  stembuf[used + 1] = '\0';
  stembuf[0] = used;

  // the stemmed form is read back out of the same buffer
  mgq_stemword (stembuf);

  text_t stemmed;
  stemmed.setcarr ((char *)(stembuf + 1), stembuf[0]);

  delete [] stembuf;
  return stemmed;
}



////////////////////////
// callback functions //
////////////////////////

// This routine is called for each document found in a search.
//   DocNum - number of the matching document
//   Weight - weight reported for the document; the whole hundreds are
//            taken as the number of query terms matched and the rest
//            as the document weight
//   info   - the queryresultsclass the document is recorded in
// The document is stored in docs.docset under its number and its number
// is appended to docs.docorder. Always returns 0.
int ourquerycallback(char * /*UDoc*/, int /*ULen*/, int DocNum,
                     float Weight, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  // without a result object there is nowhere to record the document;
  // this is the same guard termequivcallback and termfreqcallback apply
  if (queryresults == NULL) return 0;

  // build the entry for this document
  docresultclass docresult;
  docresult.docnum = DocNum;
  docresult.num_query_terms_matched = (int)(Weight/100.0); // will always be 0 on some versions of mg...
  docresult.docweight = Weight - docresult.num_query_terms_matched*100;

  // append this entry to the document results
  queryresults->docs.docset[DocNum] = docresult;
  queryresults->docs.docorder.push_back(DocNum);

  return 0;
}

int termequivcallback(char *Word, int ULen,  int /*Freq*/, 
		      float /*Weight*/,  void *info) {
  text_tset *equivterms = (text_tset *)info;
  if (equivterms == NULL) return 0;

  text_t thisterm;
  thisterm.setcarr(Word, ULen);

  equivterms->insert(thisterm);
  
  return 0;
}


void mgsearch_equivterms (const text_t &word, text_tset &equivterms) {
  // allocate working stem space
  int maxstemlen = mgq_getmaxstemlen ();
  unsigned char *word_stem = new unsigned char [maxstemlen + 2];
  if (word_stem == NULL) return;

  // copy word to word_stem
  int len = 0;
  text_t::const_iterator here = word.begin();
  text_t::const_iterator end = word.end();
  while (len < maxstemlen && here != end) {
    word_stem[len+1] = (unsigned char)(*here);
    ++len; ++here;
  }
  word_stem[len+1] = '\0';
  word_stem[0] = len;

  // get the equivalent terms
  mgq_equivterms (word_stem, termequivcallback, (void *)(&equivterms));
  
  delete [] word_stem;

  return;
}

  // File-scope set of terms.
  // NOTE(review): nothing in this file reads or writes it - termfreqcallback
  // fills the utf8equivterms member of each termfreqclass instead.
  // Presumably a leftover; check other translation units before removing it.
  text_tset utf8equivterms; // kept as utf8 string for fast matching


// This callback is called once for each term in the query
int termfreqcallback(char *Word, int ULen,  int Freq, 
		     float /*Weight*/,  void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  if (queryresults == NULL) return 0;

  text_t term;
  term.setcarr(Word, ULen);
  termfreqclass termfreq;

  termfreq.termstr = to_uni(term);
  text_t utf8termstem = mgsearch_stemword (term);
  termfreq.termstemstr = to_uni (utf8termstem);

  mgsearch_equivterms (utf8termstem, termfreq.utf8equivterms);
  
  termfreq.termfreq = Freq;
  queryresults->orgterms.push_back(termfreq);
  
  return 0;
}

// This callback is called once for each variation of each term.
//   Word, ULen - the variant, as utf8 bytes
//   info       - the queryresultsclass whose termvariants set receives
//                the variant (converted to unicode)
// Always returns 0.
int termvariantscallback(char *Word, int ULen, int /*Freq*/,
                         float /*Weight*/, void *info) {
  queryresultsclass *queryresults = (queryresultsclass *)info;
  // without a result object there is nowhere to store the variant;
  // this is the same guard termequivcallback and termfreqcallback apply
  if (queryresults == NULL) return 0;

  text_t term;
  term.setcarr(Word, ULen);
  queryresults->termvariants.insert(to_uni(term));

  return 0;
}

// This callback is for getting document text.
//   Doc  - the document text (may be NULL)
//   ULen - number of bytes of text at Doc
// When Doc is not NULL a heap copy (new[]) of the text is left in the
// file-level tempdoc; the reported length is always left in templen.
// Always returns 0.
int doctextcallback(char *Doc, int ULen,  int /*Freq*/,
                    float /*Weight*/,  void * /*info*/) {
  if (Doc != NULL) {
    // Make a copy of this string so we can unload the database without losing it.
    // Exactly ULen bytes are copied: strcpy would run past the buffer if
    // the text had no terminator within ULen bytes, and would stop short
    // at an embedded NUL, leaving the rest of the buffer uninitialised.
    int copylen = (ULen > 0) ? ULen : 0;  // a negative length copies nothing
    tempdoc = new char[copylen + 1];
    memcpy(tempdoc, Doc, (size_t)copylen);
    tempdoc[copylen] = '\0';
  }
  templen = ULen;

  return 0;
}


// Builds the path suffix for an index with filename_cat: "index", then
// the index name, then the index stem - or the collection name when no
// index stem has been set.
text_t mgsearchclass::getindexsuffix (const text_t &collection,
                                      const text_t &index) {
  text_t suffix = "index";
  suffix = filename_cat (suffix, index);

  // no index stem, use the coll name
  if (indexstem.empty())
    return filename_cat (suffix, collection);

  return filename_cat (suffix, indexstem);
}




////////////////////
// mgsearch class //
////////////////////

// Nothing is set up here beyond running the searchclass constructor.
mgsearchclass::mgsearchclass () : searchclass() {
}

// Deletes the cache object, if there is one, and clears the pointer.
mgsearchclass::~mgsearchclass () {
  // deleting a null pointer does nothing, so no test is needed first
  delete cache;
  cache = NULL;
}

// Stores the index stem; getindexsuffix uses it in place of the
// collection name whenever it is not empty.
void mgsearchclass::set_indexstem(const text_t &stem) {
  indexstem = stem;
}

// you only need to use this function before doing any stemming
// casefolding and stemming will be set if values for them are
// provided (0 or 1).
// makeindexcurrent returns true if it was able to load the database
bool mgsearchclass::makeindexcurrent (const text_t &index,
				      const text_t &subcollection,
				      const text_t &language,
				      const text_t &collection,
				      int casefolding,
				      int stemming) {
  bool databaseloaded = true;

  // get the names of the collection, index and text suffixes
  char *ccollection = collection.getcstr();
  assert (ccollection != NULL);
  char *idxsuffix = (getindexsuffix (collection, (index+subcollection+language))).getcstr();
  assert (idxsuffix != NULL);
  char *txtsuffix = (getindexsuffix (collection, "text")).getcstr();
  assert (txtsuffix != NULL);
#ifdef __WIN32__
  char *ccollectdir = (collectdir+"\\").getcstr(); assert (ccollectdir != NULL);
#else
  char *ccollectdir = collectdir.getcstr(); assert (ccollectdir != NULL);
#endif

  if (load_database(ccollection, ccollectdir, idxsuffix, txtsuffix)) {
    if (casefolding == 0) mgq_ask(".set casefold off");
    else if (casefolding > 0) mgq_ask(".set casefold on");
    if (stemming == 0) mgq_ask(".set stem off");
    else if (stemming > 0) mgq_ask(".set stem on");
    
  } else databaseloaded = false;

  // free up the c strings
  delete []ccollection;
  delete []idxsuffix;
  delete []txtsuffix;
  delete []ccollectdir;

  return databaseloaded;
}


// stemword stems a unicode word and returns the stem as unicode; the
// word is converted to utf8 for mgsearch_stemword and the result is
// converted back. It relies on the settings made by the last call to
// makeindexcurrent.
text_t mgsearchclass::stemword (const text_t &word) {
  const text_t utf8word = to_utf8 (word);
  const text_t utf8stem = mgsearch_stemword (utf8word);
  return to_uni (utf8stem);
}

// Same as stemword above, for the word given by the range [here, end).
text_t mgsearchclass::stemword (text_t::const_iterator here, text_t::const_iterator end) {
  const text_t utf8word = to_utf8 (here, end);
  const text_t utf8stem = mgsearch_stemword (utf8word);
  return to_uni (utf8stem);
}

/**
 * search runs a complete query: it clears queryresults, tries the cache,
 * checks that there is something to search for, loads the index and then
 * calls setsearchmode, submitquery and getresults in turn.
 *
 * Returns true when the results came from the cache, when the query
 * contains no letters or digits (queryresults is then left empty), or
 * when the search was run; returns false when the index could not be
 * loaded.
 */
bool mgsearchclass::search(const queryparamclass &queryparams,
                           queryresultsclass &queryresults) {
  // a cache is optional, so none is asserted here

  // start from an empty result set
  queryresults.clear();

  // a cached answer, when there is one, is returned as it is
  if (cache != NULL && cache->find(queryparams, queryresults)) return true;

  // without a letter or digit there is no query to process
  if (!has_unicode_letdig(queryparams.querystring)) return true;

  const bool loaded = makeindexcurrent (queryparams.index,
                                        queryparams.subcollection,
                                        queryparams.language,
                                        queryparams.collection);
  if (!loaded) return false;

  setsearchmode (queryparams);             // form of the results
  submitquery (queryparams);               // run the query
  getresults (queryparams, queryresults);  // collect what it found

  // Important that local library doesn't leave any files open
  unload_database();
  return true;
}

/* accumulator_method has been changed to use array rather than list.
list appears to be somewhat broken - for some ranked queries, it returned 
fewer results than it should have (e.g. 45 instead of 50). The three other
methods (array, splay_tree, hash_table) all return the same number of 
documents, in the same order, with the same ranks. list returns what 
appears to be the same documents (but fewer of them), but with different ranks,
and in a different order. Minimal time tests don't show any speed improvement
of list over array (maybe because it's broken??).  [02/2001, kjm18]

... [sjboddie, also 02/2001] turns out that changing the accumulator_method
introduced a more serious bug than it fixed (i.e. occasionally when doing a
ranked search for a very common word you get no results at all). I've
changed it back to list for now, one day we should play with other
accumulator_methods but for now I don't have time and don't want to risk
introducing bugs (better the devil you know ;)
*/
// Sends the settings for a query to mg with mgq_ask: first a fixed
// group, then the ones taken from queryparams (search_type, casefolding,
// stemming, maxdocs, maxnumeric).
void mgsearchclass::setsearchmode (const queryparamclass &queryparams)
{
  // settings that are the same for every query
  mgq_ask(".set expert true");
  mgq_ask(".set sorted_terms true");
  mgq_ask(".set accumulator_method list");
  mgq_ask(".set max_accumulators 500000");
  mgq_ask(".set maxparas 500000");
  mgq_ask(".set verbatim true");
  mgq_ask(".unset skip_dump");
  mgq_ask(".set mode docnums");

  // for each of the next three, a value other than 0 or 1 sends nothing
  if (queryparams.search_type == 0) mgq_ask(".set query boolean");
  else if (queryparams.search_type == 1) mgq_ask(".set query ranked");

  if (queryparams.casefolding == 1) mgq_ask(".set casefold on");
  else if (queryparams.casefolding == 0) mgq_ask(".set casefold off");

  if (queryparams.stemming == 1) mgq_ask(".set stem on");
  else if (queryparams.stemming == 0) mgq_ask(".set stem off");

  mgq_ask(".set heads_length 150");

  // one buffer serves both numeric settings
  char command[32];

  // a maxdocs of -1 asks for all documents
  if (queryparams.maxdocs != -1) {
    sprintf(command, ".set maxdocs %i", queryparams.maxdocs);
    mgq_ask(command);
  } else {
    mgq_ask(".set maxdocs all");
  }

  sprintf(command, ".set maxnumeric %i", queryparams.maxnumeric);
  mgq_ask(command);
}

/**
 * submitquery constructs the query string (into UTF8 encoding)
 * and submits it using mgq_ask to the mg search engine.  Most
 * of the processing will be done inside Greenstone
 */
void mgsearchclass::submitquery (const queryparamclass &queryparams)
{
  // sort out the query string; copy it, remove all special characters
  // and then convert it to a string in UTF8 format
  text_t ttquerystring = queryparams.querystring;
  filterquery (ttquerystring);
  char *querystring = to_utf8(ttquerystring).getcstr();

  // submit the query
  mgq_ask(querystring);

  // destroy the temporary character array
  delete []querystring;
}

/**
 * getresults collects the outcome of the query sent by submitquery.
 *
 * It calls mgq_results three times - for the document numbers, the term
 * frequencies and the term variants - and then mgq_docsretrieved, from
 * which docs_matched and is_approx are filled in.
 */
void mgsearchclass::getresults (const queryparamclass &queryparams,
                                queryresultsclass &queryresults) {
  // every callback receives the result object through this pointer
  void *sink = (void *)(&queryresults);

  // a maxdocs of -1 means no limit was asked for
  const bool unlimited = (queryparams.maxdocs == -1);
  int doclimit = MAXNUMDOCS;
  if (!unlimited) doclimit = queryparams.maxdocs;

  // document numbers
  mgq_results(result_docnums, 0, doclimit, ourquerycallback, sink);

  // term frequencies
  mgq_results(result_termfreqs, 0, MAXNUMTERMS, termfreqcallback, sink);
  queryresults.sortuniqqueryterms();

  // term variants
  mgq_results(result_terms, 0, MAXNUMTERMS, termvariantscallback, sink);

  // number of documents retrieved
  int retrieved = 0, approx = 0;
  mgq_docsretrieved (&retrieved, &approx);

  if (retrieved != 0) {
    queryresults.docs_matched = retrieved;
    if (approx) queryresults.is_approx = Approximate;
    else queryresults.is_approx = Exact;
    return;
  }

  // no count available (or it really was zero): count what was collected.
  // Once that count reaches a maxdocs limit there may be more matches.
  queryresults.docs_matched = queryresults.docs.docset.size();
  if (unlimited || (queryresults.docs_matched < queryparams.maxdocs))
    queryresults.is_approx = Exact;
  else
    queryresults.is_approx = MoreThan;
}

/**
 * Tidies the given querystring in place: every character that is neither
 * a unicode letter or digit nor one of the boolean operator characters
 * ! & | ( ) is replaced by a space.
 */
void mgsearchclass::filterquery (text_t &ttquerystring) {
  for (text_t::iterator ch = ttquerystring.begin (),
         stop = ttquerystring.end (); ch != stop; ++ch) {
    const bool keep = is_unicode_letdig(*ch) || (*ch == '!') ||
      (*ch == '&') || (*ch == '|') || (*ch == '(') || (*ch == ')');
    if (!keep) *ch = ' ';
  }
}


// Places the text of document 'docnum' in 'output', converted from utf8
// to unicode and with every control-C replaced by a space.
// docTargetDocument returns 'true' if it was able to get a document and
// 'false' if mgdocument failed or gave no text; 'output' is cleared first
// in every case.
// collection and the default index, subcollection and language are
// passed on to mgdocument, which loads an index with them.
bool mgsearchclass::docTargetDocument(const text_t &defaultindex,
                                      const text_t &defaultsubcollection,
                                      const text_t &defaultlanguage,
                                      const text_t &collection,
                                      int docnum,
                                      text_t &output) {
  output.clear();

  // fetch the mg version of the document; the buffer is ours to free
  char *rawdoc = NULL;
  int rawlen = 0;
  const bool fetched = mgdocument (defaultindex, defaultsubcollection,
                                   defaultlanguage, collection, docnum,
                                   rawdoc, rawlen);
  if (!fetched || rawdoc == NULL) return false;

  // control-Cs become spaces
  for (int i = 0; i < rawlen; ++i) {
    if (rawdoc[i] == '\x3') rawdoc[i] = ' ';
  }

  // utf8 to unicode
  utf8inconvertclass inconvert;
  convertclass::status_t status;
  inconvert.reset ();
  inconvert.setinput (rawdoc, rawlen);
  inconvert.convert (output, status);

  delete [] rawdoc;
  return true;
}


// Retrieves the raw text of document 'docnum' from mg.
// The index is loaded with makeindexcurrent; loading just the text
// database (load_text_database) was tried here once and is disabled.
// On return UDoc is the new[] copy made by doctextcallback (NULL if none
// was made) and ULen the length it recorded; the caller frees UDoc.
// Returns true if the index could be loaded.
bool mgsearchclass::mgdocument (const text_t &defaultindex,
                                const text_t &defaultsubcollection,
                                const text_t &defaultlanguage,
                                const text_t &collection,
                                int docnum,
                                char *&UDoc, int &ULen) {
  UDoc = NULL;
  ULen = 0;

  // try and load the database
  const bool loaded = makeindexcurrent (defaultindex, defaultsubcollection,
                                        defaultlanguage, collection);
  if (loaded) {
    // the query is simply the document number, asked for in text mode
    char docnumstr[32];
    sprintf(docnumstr, "%i", docnum);

    mgq_ask(".set mode text");
    mgq_ask(".set query docnums");
    mgq_ask(docnumstr);

    // doctextcallback leaves its copy in tempdoc / templen
    tempdoc = NULL;
    templen = 0;
    mgq_results (result_docs, 0, 1, doctextcallback, (void *)NULL);
    UDoc = tempdoc;
    ULen = templen;
  }

  // Important that local library doesn't leave any files open
  unload_database();
  return loaded;
}

// unload_database hands over to mgq's close_all_databases function, which
// clears any cached databases. This is useful when every trace of a
// collectionserver has to be removed at runtime (when using a persistent
// version of Greenstone like the windows local library).
void mgsearchclass::unload_database ()
{
  close_all_databases();
}
