/**********************************************************************
 *
 * highlighttext.cpp -- 
 * Copyright (C) 2002   DL Consulting Ltd
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "highlighttext.h"
#include "unitool.h"


// File-local helpers used by highlighttext(); they are defined further down
// in this file.

// Writes text to textout, wrapping every word that appears in any of the
// terms' matchTerms lists in shl ... ehl.
static void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
			    const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
			    ostream &textout);

// Writes text to textout, wrapping each occurrence of the phrase formed by
// the words of querystring in shl ... ehl.
static void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms, 
			      const text_t &shl, const text_t &ehl, displayclass &disp, 
			      outconvertclass &outconvert, ostream &textout);

// Splits querystring into its lower-cased letter/digit words.
static void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms);

// Strips leading and trailing whitespace from qstring in place.
static void remove_space (text_t &qstring);

// highlights text string by adding _starthighlight_ and _endhighlight_
// around terms and/or phrases that match querystring

// - at present this only handles phrase searches where the first and last
// characters are double quotes (i.e. it won't correctly handle a mixture
// of phrase and non-phrase terms or queries containing multiple phrases) -
// it also doesn't highlight stemmed variations of terms within a phrase
// because the terminfo returned by mgqueryfilter doesn't currently tell
// you which term variants belong to which term

// - this function can be forced to treat the querystring like a phrase
// even if it isn't one by setting the "hl" cgi argument to "2"
// Writes text to textout with the query's terms (or phrase) highlighted.
//
//   text       - the document text; macros in it are expanded before
//                highlighting
//   args       - cgi arguments; "q" holds the query string (its leading and
//                trailing whitespace is stripped in place) and "hl" == 2
//                forces phrase highlighting
//   terms      - term information for the query; the matchTerms of each
//                entry are the words eligible for highlighting
//   disp       - used to expand macros and to write the output
//   outconvert - output converter applied when writing to textout
//   textout    - stream receiving the highlighted text
void highlighttext(const text_t &text, cgiargsclass &args, const TermInfo_tarray &terms, 
		   displayclass &disp, outconvertclass &outconvert, ostream &textout) {

  text_t &querystring = args["q"];

  // get the text to start and end a highlight
  text_t shl = "<b><u>";
  text_t ehl = "</u></b>";
  if (disp.isdefaultmacro(displayclass::defaultpackage, "starthighlight")) {
    disp.expandstring(displayclass::defaultpackage, "_starthighlight_", shl);
  }
  if (disp.isdefaultmacro(displayclass::defaultpackage, "endhighlight")) {
    disp.expandstring(displayclass::defaultpackage, "_endhighlight_", ehl);
  }

  // remove leading and trailing whitespace
  remove_space(querystring);

  // Expand macros before highlighting -- by Jens Wille
  text_t text_expanded = "";
  disp.expandstring(text, text_expanded);

  // phrase highlighting was explicitly requested; an empty query has no
  // phrase to look for, so in that case fall through to term highlighting
  const bool forced_phrase = (args["hl"] == 2) && !querystring.empty();

  // a quoted phrase needs an opening and a separate closing double quote;
  // the size check also keeps us from dereferencing the iterators of an
  // empty query string
  const bool quoted_phrase = (querystring.size() >= 2) &&
    (*(querystring.begin()) == '"') && (*(querystring.end()-1) == '"');

  if (forced_phrase || quoted_phrase) {
    highlight_phrases(text_expanded, querystring, terms, shl, ehl, disp, outconvert, textout);
  } else {
    highlight_terms(text_expanded, terms, shl, ehl, disp, outconvert, textout);
  }
}

// Writes text to textout, wrapping every word found in the matchTerms of
// any entry of terms in shl ... ehl.
//
// A word is a maximal run of characters for which is_unicode_letdig() is
// true; the comparison against the match terms is exact. Everything from a
// '<' up to the next '>' is copied through untouched so that words inside
// html tags are never highlighted. Output is flushed to textout in chunks
// of roughly 1024 characters.
void highlight_terms(const text_t &text, const TermInfo_tarray &terms, const text_t &shl,
		     const text_t &ehl, displayclass &disp, outconvertclass &outconvert,
		     ostream &textout) {

  // first load all the term variations into a map so that each word of the
  // text needs only a single lookup
  text_tmap allterms;
  TermInfo_tarray::const_iterator this_term = terms.begin();
  TermInfo_tarray::const_iterator last_term = terms.end();
  while (this_term != last_term) {
    text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
    text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
    while (this_var != last_var) {
      allterms[*this_var] = 1;
      ++this_var;
    }
    ++this_term;
  }

  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();

  text_t word, buffer;
  while (true) {
    if ((here != end) && is_unicode_letdig(*here)) {
      // not word boundary
      word.push_back(*here);
      ++here;
      continue;
    }

    // found word boundary (the end of the text counts as one, so a word
    // that runs right up to the end is still written out)
    if (!word.empty()) {
      if (allterms.find(word) != allterms.end()) {
        buffer += shl;
        buffer += word;
        buffer += ehl;
      } else {
        buffer += word;
      }
      word.clear();
    }

    if (here == end) break;

    if (*here == '<') {
      // skip over rest of html tag
      while ((here != end) && (*here != '>')) {
        buffer.push_back(*here);
        ++here;
      }
    }

    // copy the boundary character (or the tag's closing '>'); an
    // unterminated tag leaves us at the end with nothing more to copy
    if (here != end) {
      buffer.push_back(*here);
      ++here;
    }

    if (buffer.size() > 1024) {
      textout << outconvert << disp << buffer;
      buffer.clear();
    }
  }
  textout << outconvert << disp << buffer;
}

void highlight_phrases(const text_t &text, const text_t &querystring, const TermInfo_tarray &terms, 
		       const text_t &shl, const text_t &ehl, displayclass &disp, 
		       outconvertclass &outconvert, ostream &textout) {

  text_tmap allterms;
  text_tarray phrase_terms;
  text_tmap::const_iterator it;

  get_phrase_terms(querystring, phrase_terms);
  int phraselen = phrase_terms.size();

  TermInfo_tarray::const_iterator this_term = terms.begin();
  TermInfo_tarray::const_iterator last_term = terms.end();
  bool first = true;
  while (this_term != last_term) {
    text_tarray::const_iterator this_var = (*this_term).matchTerms.begin();
    text_tarray::const_iterator last_var = (*this_term).matchTerms.end();
    while (this_var != last_var) {
      allterms[*this_var] = 1;
      ++this_var;
    }
    first = false;
    ++this_term;
  }

  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();

  text_t word, buffer;
  int phrasecount = 0;
  while (here != end) {
    if (is_unicode_letdig(*here)) {
      // not word boundary
      word.push_back(*here);
      ++here;

    } else {
      // found word boundary
      // add last word if there was one
      if (!word.empty()) {
	it = allterms.find(word);
	if (it != allterms.end()) {
	  // found a word that matches somewhere in the phrase

	  text_t lcword = word; lc(lcword);
	  if (lcword == phrase_terms[phrasecount]) {

	    if (phrasecount == 0) {
	      // clear the buffer (from here on buffer will contain the phrase
	      // as it's built up)
	      textout << outconvert << disp << buffer;
	      buffer.clear();
	    }
	    ++phrasecount;
	  } else {
	    phrasecount = 0;
	  }
	} else {
	  phrasecount = 0;
	}
	buffer += word;
        word.clear();
	
	if (phrasecount == phraselen) {
	  // have found entire phrase
	  textout << outconvert << disp << shl << buffer << ehl;
	  buffer.clear();
	  phrasecount = 0;
	}
      }

      if (*here == '<') {
        // skip over rest of html tag
	while ((here != end) && (*here != '>')) {
	  buffer.push_back(*here);
	  ++here;
	}
      }

      buffer.push_back(*here);
      ++here;

      if (buffer.size() > 1024 && phrasecount == 0) {
	textout << outconvert << disp << buffer;
	buffer.clear();
      }
    }
  }
  textout << outconvert << disp << buffer;
}

void get_phrase_terms (const text_t &querystring, text_tarray &phrase_terms) {

  phrase_terms.erase(phrase_terms.begin(), phrase_terms.end());

  text_t::const_iterator here = querystring.begin();
  text_t::const_iterator end = querystring.end();

  text_t word;
  while (here != end) {
    if (is_unicode_letdig(*here)) {
      // not word boundary
      word.push_back(*here);

    } else {
      // found word boundary
      if (!word.empty()) {
	lc(word);
	phrase_terms.push_back(word);
	word.clear();
      }
    }
    ++here;
  }

  if (!word.empty()) {
    lc(word);
    phrase_terms.push_back(word);
  }
}

// Strips leading and trailing whitespace (as judged by is_unicode_space())
// from qstring in place. Whitespace between non-space characters is kept
// as it is.
void remove_space (text_t &qstring) {

  text_t trimmed;
  // whitespace seen since the last non-space character; it is only copied
  // across once another non-space character shows that it is interior
  text_t pending;

  const text_t::const_iterator stop = qstring.end();
  for (text_t::const_iterator pos = qstring.begin(); pos != stop; ++pos) {
    if (is_unicode_space(*pos)) {
      pending.push_back(*pos);
      continue;
    }

    // leading whitespace is dropped because nothing has been kept yet
    if (!trimmed.empty()) {
      trimmed += pending;
    }
    pending.clear();
    trimmed.push_back(*pos);
  }

  qstring = trimmed;
}
