/**********************************************************************
 *
 * summarise.cpp -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

/* The function 'summarise' produces, given a document text and a query,
 * a (query-biased) summary. In the future, several types of summaries will
 * be supported.
 */

#include "summarise.h"
#include "unitool.h"
#include <string.h>

#include <iostream>
using namespace std;

/* **************** LOCAL PROTOTYPES **************** */

// Summary built from the leading and trailing sentences of the document.
text_t summarise_startend(text_t &htmlstr, int summaryLength);
// Summary built from the sentences that contain the query terms.
text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength);

// Extract one sentence scanning forwards; 'start' is advanced past what
// was consumed and 'end' is the upper limit of the scan.
text_t next_sentence(text_t::iterator& start, text_t::iterator& end);
// Extract one sentence scanning backwards; 'start' is moved down towards
// 'end', which is the lower limit of the scan.
text_t previous_sentence(text_t::iterator& start, text_t::iterator& end);
// True when the iterator is positioned on the '<' of a <p ...> tag.
bool paragraph_tag(text_t::iterator start);


/****************************************************
 NAME: summarise
 DESC: public entry point: build a summary of the document
       'htmlstr' for the query 'query', aiming at roughly
       'summaryLength' characters.
       Only the keyword-based strategy is used at present.
*****************************************************/

text_t summarise(text_t &htmlstr, text_t &query, int summaryLength) {
  // The query-independent alternative would be
  //   summarise_startend(htmlstr, summaryLength)
  text_t summary = summarise_keywords(htmlstr, query, summaryLength);
  return summary;
}


/****************************************************
 NAME: summarise_startend
 DESC: return first and last sentences of a document.
       Sentences are taken from the beginning until about half of
       'summaryLength' is used, then from the end (without going
       back over text already used) until the remainder is used.
       The two parts are joined with " ... ".
       A negative 'summaryLength' is treated as 0.
*****************************************************/

text_t summarise_startend(text_t &htmlstr, int summaryLength) {
  // A negative budget would be converted to a huge unsigned value when
  // compared against a size and would pull in the whole document.
  if(summaryLength<0)
    summaryLength = 0;

  text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();
  text_t answer;

  // add first sentences up to half the summary length
  const int headBudget = summaryLength/2;
  text_t::iterator str_current = str_start;
  while(str_current<str_end && static_cast<int>(answer.size())<headBudget) {
    text_t sentence = next_sentence(str_current,str_end);
    answer.append(sentence);
  }

  // summary length left for last sentences
  int tailBudget = summaryLength - static_cast<int>(answer.size());
  if(tailBudget<0)
    tailBudget = 0;

  text_t lastSentence;
  if(!htmlstr.empty()) { // end()-1 is only a valid iterator for non-empty text
    // the backward scan must not go below what the forward scan consumed
    str_end = str_current;
    str_current = htmlstr.end()-1;
    while(str_current>str_end &&
          static_cast<int>(lastSentence.size())<tailBudget) {
      text_t sentence = previous_sentence(str_current,str_end);
      lastSentence = sentence + lastSentence; // keep document order
    }
  }

  answer += " ... ";
  answer += lastSentence;

  return answer;
}


/****************************************************
 NAME: summarise_keywords
 DESC: build a summary with sentences containing keyword(s).
       the sentences with most matching keywords are returned 
       first.
       Returns an empty text when the query or the document is
       empty, or when the query holds no non-empty term.
       Falls back to summarise_startend when no sentence
       contains any of the terms.
*****************************************************/

text_t summarise_keywords(text_t &htmlstr, text_t &query, int summaryLength) 
{

  if ((query.size()==0) || (htmlstr.size()==0)) {
    return "";
  }

  // split the query on blanks and consider only non-empty terms
  text_tarray allterms, terms;
  splitchar(query.begin(),query.end(),' ',allterms);
  for (text_tarray::iterator term = allterms.begin();
       term != allterms.end(); ++term) {
    if (!term->empty())
      terms.push_back(*term);
  }

  if (terms.empty()) {
    return "";
  }

  text_tarray::iterator terms_start = terms.begin(), terms_end = terms.end();
  text_t::iterator str_start = htmlstr.begin(), str_end = htmlstr.end();

  // an array of array of sentences for the summary:
  //   answers[0] contains sentences with 1 keyword
  //   answers[1] contains sentences with 2 keywords, etc.
  vector<text_tarray> answers(terms.size());

  // answersSize[0] is the combined size of sentences with 1 keyword, etc.
  // (every entry starts at 0)
  vector<int> answersSize(terms.size(), 0);

  text_t::iterator str_current = str_start;
  while (str_current<str_end && answersSize[terms.size()-1]<summaryLength) {
    // if the size of best sentences is greater than summary, that's enough!
    text_t sentence = next_sentence(str_current,str_end);

    // count how many of the query terms occur in this sentence
    int nFound = 0;
    for (text_tarray::iterator terms_current = terms_start;
         terms_current != terms_end; ++terms_current) {
      text_t::iterator word = findword(sentence.begin(),sentence.end(),
                                       *terms_current);
      if (word!=sentence.end()) {
        ++nFound;
      }
    }

    // file the sentence under its keyword count, unless that bucket
    // already holds enough text for a whole summary
    if (nFound>0 && answersSize[nFound-1]<summaryLength) {
      answers[nFound-1].push_back(sentence);
      answersSize[nFound-1] += static_cast<int>(sentence.size());
    }
  }

  text_t answer;

  // Walk the buckets from most keywords to fewest. A reverse iterator is
  // used on purpose instead of arithmetic such as answers.end()-1: an
  // earlier version written that way was seen to crash in this block.
  for (vector<text_tarray>::reverse_iterator sentarray = answers.rbegin();
       sentarray != answers.rend(); ++sentarray) {
    for (text_tarray::iterator sentence = sentarray->begin();
         sentence != sentarray->end(); ++sentence) {
      answer.append(*sentence);

      // a bucket is only non-empty when summaryLength is positive, so
      // comparing as int is safe here
      if (static_cast<int>(answer.size())>=summaryLength) {
        return answer;
      }
    }
  }

  if (!answer.empty()) {
    return answer;
  }

  // no sentence matched any term: use the query-independent summary
  return summarise_startend(htmlstr,summaryLength);
}


/* *********************** LOCAL FUNCTIONS ******************* */

/* NAME: next_sentence
   DESC: returns next sentence, text-only (ie. HTML markup is removed)

   Scans forward from 'start' and never reads at or beyond 'end'.
   On return 'start' has been advanced past everything consumed.
   A sentence ends
     - at the end of the range,
     - after '.', '!' or '?' followed by white space, unless the
       character just before the punctuation is a capital A-Z
       (guessed to be an acronym or initial), or
     - at a <p> tag, once has_unicode_letdig() reports true for the
       text collected so far.
   Everything from '<' up to and including the next '>' is dropped.
 */

text_t next_sentence(text_t::iterator& start, text_t::iterator& end) {
  text_t sentence;                 // the sentence to be returned
  bool   foundPunctuation = false; // set to true by '.', '!' or '?'
  const text_t::iterator first = start; // nothing before this may be read

  while(start<end && !foundPunctuation) {
    switch (*start) {
    case '<': // skip over rest of html tag
      // paragraph_tag looks at up to two characters after the '<'; only
      // call it when they are inside the range ("<p>" needs three)
      if((end-start)>=3 &&
         paragraph_tag(start) && has_unicode_letdig(sentence))
        foundPunctuation = true;
      while ((start<end) && (*start!='>'))
        ++start;
      if(start<end) ++start;
      break;
    case '.':
    case '!':
    case '?': {
      const text_t::iterator punct = start; // position of the punctuation
      sentence.push_back(*start);
      ++start;
      if(start>=end) {
        foundPunctuation = true;
      } else if(is_unicode_space(*start)) {
        // only look at the preceding character when there is one
        const bool afterCapital =
          punct>first && *(punct-1)>='A' && *(punct-1)<='Z';
        if(!afterCapital)
          foundPunctuation = true;
      }
      break;
    }
    default:
      sentence.push_back(*start);
      ++start;
      break;
    }
  }
  return sentence;
}


/* NAME: previous_sentence
   DESC: returns previous sentence, text-only (ie. HTML markup is removed)

   Scans backwards from 'start' while 'start' is above 'end'; the
   element at 'end' itself is never consumed. Characters are inserted
   at the front of the result so that it reads in document order.
   On return 'start' is either on the punctuation mark that stopped the
   scan (that mark is not consumed), just below the '<' of the <p> tag
   that stopped it, or at/below 'end'.

   NOTE(review): the punctuation case reads *(start+1). If 'start' is
   the last element of the underlying text (summarise_startend passes
   htmlstr.end()-1) and that element is '.', '!' or '?', this reads one
   past the last element -- confirm and guard.
 */

text_t previous_sentence(text_t::iterator& start, text_t::iterator& end) {
  text_t sentence;                    // the sentence to be returned
  bool   found1stPunctuation = false, // set to true by '.', '!' or '?'
                                      // first punct. is included in results
         found2ndPunctuation = false; // second punct. is stop condition,
                                      // and is not included in results
  while(start>end && !found2ndPunctuation) {
    switch (*start) {
    case '>': // skip over rest of html tag
      while ((start>end) && (*start!='<')) // backtrack to beginning of tag
	--start;
      if(start>end) {
        // a <p> tag ends the sentence once has_unicode_letdig() reports
        // true for the text collected so far
        if(paragraph_tag(start) && has_unicode_letdig(sentence))
          found2ndPunctuation = true;
        --start; // step below the '<' in either case
      }
      break;
    case '.':
    case '!':
    case '?':
      if(!is_unicode_space(*(start+1)) ||
         (start-1>end && *(start-1)>='A' && *(start-1)<='Z')) {
        // if next character is not a blank, or preceding character is
        // a capital letter, we guess it's an acronym (e.g. "U.S.A.")
        // and keep the mark as ordinary text
        sentence.text_as_usvector().insert(sentence.text_as_usvector().begin(),
          start,start+1);
        --start;
      } else
        // this 'else' belongs to the acronym test above: the mark looks
        // like a real sentence end
        if(has_unicode_letdig(sentence) || found1stPunctuation)
          // it closes the sentence before ours: stop, and leave 'start'
          // on the mark
          found2ndPunctuation = true;
        else {
          // it is the closing mark of the sentence being collected:
          // keep it and remember that it was seen
          sentence.text_as_usvector().insert(
              sentence.text_as_usvector().begin(),start,start+1);
          --start;
          found1stPunctuation = true;
        }
      break;
    default:
      // ordinary character: prepend it
      sentence.text_as_usvector().insert(
        sentence.text_as_usvector().begin(),start,start+1);
      --start;
      break;
    }
  }
  return sentence;
}


// Tells whether 'start' is positioned on the '<' of a paragraph tag,
// i.e. "<p" or "<P" followed by white space or '>'.
// Up to two elements after 'start' are read without any range check,
// so the caller must make sure they exist.
bool paragraph_tag(text_t::iterator start) {
  if(*start!='<')
    return false;

  ++start; // tag name
  if(*start!='p' && *start!='P')
    return false;

  ++start; // character right after the name
  return is_unicode_space(*start) || *start=='>';
}
