/**************************************************************************
 *
 * QueryParser.cpp -- Query parser for a simple query language
 * Copyright (C) 2000  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "GSDLQueryParser.h"
#include "GSDLQueryLex.h"
#include "words.h"

static QueryNode *ParseExpression (UCArray::const_iterator &here,
				   UCArray::const_iterator end, 
				   int defaultBoolCombine,
				   int defaultStemMethod);

static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
  if (t1 == NULL) return t2;
  if (t2 == NULL) return t1;

  AndQueryNode *andNode = new AndQueryNode;
  andNode->leftNode = t1;
  andNode->rightNode = t2;
  return andNode;
}

static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
  if (t1 == NULL) return t2;
  if (t2 == NULL) return t1;

  OrQueryNode *orNode = new OrQueryNode;
  orNode->leftNode = t1;
  orNode->rightNode = t2;
  return orNode;
}

static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
  if (t1 == NULL) return t2;
  if (t2 == NULL) return t1;

  NotQueryNode *notNode = new NotQueryNode;
  notNode->queryNode = t1;
  notNode->notNode = t2;
  return notNode;
}

// expects the opening bracket to have already been parsed
// and discarded
static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
					  UCArray::const_iterator end, 
					  int defaultBoolCombine,
					  int defaultStemMethod) {
  // get everything in the expression
  QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
					defaultStemMethod);
  
  // gobble up tokens until a closing bracket is found
  // or the end of the string
  LexEl el;
  while (ParseLexEl (here, end, el)) {
    if (el.lexType == CloseBracketE) break;
  }

  return curTree;
}

static int ParseInt (UCArray::const_iterator &here,
		     UCArray::const_iterator end) {
  LexEl el;
  UCArray::const_iterator oldHere = here;
  if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
    return el.num;
  
  here = oldHere; // not an integer
  return 0;
}

// default is within 20 words
static void SetRangeValues (TermNode &termNode,
			    UCArray &nearby,
			    bool reverse) {
  UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
  UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
  
  if (nearby == NEARBY) { // no modifier
    termNode.startRange = (NEAR_DEFAULT+1)*-1;
    termNode.endRange = NEAR_DEFAULT;

  } else if (nearby == WITHIN) { // no modifier
    if (reverse) {
      termNode.startRange = (NEAR_DEFAULT+1)*-1;
      termNode.endRange = -1;
    } else {
      termNode.startRange = NEAR_DEFAULT;
      termNode.endRange = 0;
    }
  }
  else { // extract number
    UCArray::const_iterator here;
    bool within = false;
    if (PrefixLen(nearby, WITHIN)==6) {
      within=true;
      here = nearby.begin()+6;
    } else {
      here = nearby.begin()+4;
    }
    UCArray::const_iterator end = nearby.end();
    int size=0;
    while (here != end) {
      size = size*10 + (*here-'0');
      ++here;
    }
    if (within) {
      if (reverse) {
	termNode.startRange = size;
	termNode.endRange = 0;
      } else {
	termNode.startRange = -1 * (size+1);
	termNode.endRange = -1;
      }
    } else {
      termNode.startRange = -1 * (size+1);
      termNode.endRange = size;
    }
  }
}

static mg_u_long GetStemMethod(LexEl &el, int defaultStemMethod) {
  // here expect el to contain some of c,s,i,u,f,a -- see stemmer.h CHAR_FLAG_STEM_* constants
  mg_u_long stem = (mg_u_long)defaultStemMethod;
      
  UCArray::const_iterator here = el.text.begin();
  UCArray::const_iterator end = el.text.end();

  /* [JFG - Mar 06: Accent folding patch] */
  /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
  while(here != end) {
    unsigned char ch = *here; 
    if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
      return STEM_INVALID; // incorrect format
    
    switch(ch) {
    case CHAR_FLAG_STEM_CaseFold:		// ignore case (fold)
      stem |= STEM_CaseFolding;
      break;
    case CHAR_FLAG_STEM_NoCaseFold:		// case sensitive 
      stem &= (~STEM_CaseFolding);
      break;
    case CHAR_FLAG_STEM_Stemming:		// stem words
      stem |= STEM_Stemming;
      break;
    case CHAR_FLAG_STEM_NoStemming:		// do not stem words
      stem &= (~STEM_Stemming);
      break;
#ifdef ENABLE_ACCENTFOLD 
   case CHAR_FLAG_STEM_AccentFold:		// accent fold
      stem |= STEM_AccentFolding;
      break;
    case CHAR_FLAG_STEM_NoAccentFold:	// do no accent folding
      stem &= (~STEM_AccentFolding);
      break;
#endif
    };
    
    ++here;	    
  }
  return stem;
}
  

static void ParseTermModifiers (UCArray::const_iterator &here,
				UCArray::const_iterator end,
				TermNode &termNode,
				int defaultStemMethod) {
 
  termNode.stemMethod = defaultStemMethod;
  bool partial_match = false;
  LexEl el;
  UCArray::const_iterator oldHere = here;
  while (ParseLexEl (here, end, el)) {
    if (el.lexType == TermWeightE) {
      termNode.termWeight = ParseInt (here, end);
      
    } else if (el.lexType == StemMethodE) {
      oldHere = here;
      LexEl stem;
      if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
	termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
    /* [JFG - Mar 06: Accent folding patch] */
    /* use STEM_INVALID instead of hardcoded 4 */
	if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
	  here = oldHere;
	  termNode.stemMethod = (mg_u_long)defaultStemMethod;
	} 
      } else here = oldHere; //ignore - wrong syntax

    } else if (el.lexType == RangeE) {
      termNode.startRange = ParseInt (here, end);
      termNode.endRange = ParseInt (here, end);
      
    } else if (el.lexType == AtE) {
      termNode.startRange = termNode.endRange = ParseInt (here, end);
    } else if (el.lexType == StarE) {
      partial_match = true;
    } else {
      // no term modifiers
      here = oldHere;
      break;
    }
    
    if (partial_match) {
      /* [JFG - Mar 06: Accent folding patch] */
      /* use STEM_PARTIAL_MATCH flag */
      termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag
      termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
      termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
    }
    oldHere = here;
  }  
}

static void ParseProxModifiers (UCArray::const_iterator &here,
				UCArray::const_iterator end,
				ProxMatchQueryNode *proxNode) {
  // so far only have one - the tag stuff
  LexEl el;
  UCArray::const_iterator oldHere = here;
  while (ParseLexEl (here, end, el)) {
    if (el.lexType == TagE) {
      oldHere = here; // don't backtrack past here
      if (ParseLexEl (here, end, el) && el.lexType == TermE) {
	proxNode->tagNodePtr = new TagNode;
	proxNode->tagNodePtr->tagName = el.text;
	
      }
      else { // error in tag
	here = oldHere;
      }
    } // TagE
    // add in other cases here
    else {
      // no modifiers
      here = oldHere;
      break;
    }
    oldHere = here;
  }//while


}

// expects starting brackets to have been parsed
// sets error to true if something has gone wrong
static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
				UCArray::const_iterator end,
				/*ProxMatchQueryNode *proxNode,*/
				int defaultStemMethod,
				bool & error) {

  ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
  LexEl el;
  bool phrase=false;
  bool first=true;
  bool prox = false;
  UCArray near_string;
  while (ParseLexEl (here, end, el)) {
    // cant have AND, OR, NOT in square brackets, so assume they are words
    if (el.lexType == TermE || el.lexType == IntegerE || el.lexType == AndOpE || el.lexType == OrOpE || el.lexType == NotOpE) {
      TermNode termNode;
      termNode.term = el.text;
      ParseTermModifiers (here, end, termNode, defaultStemMethod);
      if (phrase) {
        if (first) first=false;
        else {
          termNode.startRange = -2;
          termNode.endRange = -1;
        }
      } else if (prox) {
	SetRangeValues(termNode, near_string, false);
	prox = false;
      }
      proxNode->terms.push_back(termNode);
    }
    else if (el.lexType == CloseSquareBracketE) {
      break;
    }
    else if (el.lexType == QuoteE) {
      // phrase inside square brackets
      if (phrase) { // end of phrase
	phrase=false;
	first = true;
      } else {
	phrase=true; // start of phrase
      }
    } else if (el.lexType == NearOpE || el.lexType == WithinOpE) {
      if (phrase) {
	// cant have proximity  op in a phrase - just assume its an actual word
	TermNode termNode;
	termNode.term = el.text;
	ParseTermModifiers (here, end, termNode, defaultStemMethod);
	proxNode->terms.push_back(termNode);
      } else {
	// its a NEAR or within op
	prox = true;
	near_string = el.text;
      }
	  
    } 
    else if (el.lexType == UnknownE) {
      // just ignore it 
    }
    else {
      //error - we set the proxNode to NULL,
      cerr <<"GSDLQueryParser: bad syntax inside []\n";
      error = true;
      return NULL;
    }
  } // while
  return proxNode;
}
// expects the starting quote to have been parsed
// and discarded
// now phrases use the case and stem preference options
// ie can search for a phrase ignoring case
static void ParsePhrase (UCArray::const_iterator &here,
			 UCArray::const_iterator end,
			 ProxMatchQueryNode &proxNode, 
			 int defaultStemMethod,
			 bool &error) {
  LexEl el;
  bool first = true;
  while (ParseLexEl (here, end, el)) {
    if (el.lexType == TermE || el.lexType == IntegerE) {
      TermNode termNode;
      termNode.term = el.text;
      //termNode.stemMethod = defaultStemMethod;
      ParseTermModifiers (here, end, termNode, defaultStemMethod);
      if (first) {
	first = false;
      }
      else {
	termNode.startRange = -2;
	termNode.endRange = -1;
      }
      proxNode.terms.push_back (termNode);
      
    } else if (el.lexType == QuoteE) {
      break;
      
    } else if (el.lexType == UnknownE) {
      // just ignore it
    } else {
      // error 
      error = true;
      return;
    }
  }
}

static QueryNode *ParseTerm (UCArray::const_iterator &here,
			     UCArray::const_iterator end,
			     int defaultBoolCombine,
			     int defaultStemMethod) {
  LexEl el;

  UCArray::const_iterator oldHere = here;
  if (!ParseLexEl (here, end, el)) return NULL;

  if (el.lexType == OpenBracketE)
    return ParseBracketExpression (here, end, defaultBoolCombine,
				   defaultStemMethod);

  ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;

  if (el.lexType == TermE || el.lexType == IntegerE) {
    TermNode termNode;
    termNode.term = el.text;
    ParseTermModifiers (here, end, termNode, defaultStemMethod);
    oldHere = here;  // dont backtrack past here
    if (ParseLexEl(here, end, el) && (el.lexType == NearOpE || el.lexType == WithinOpE )) {
	delete proxNode;
	oldHere = here;
      // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
      
      // if the next element is a '(' have a syntax error, return NULL 
      LexEl temp_el;
      if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
	cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
	return NULL;
      }
      here = oldHere; // else backtrack
	
      proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
						 defaultStemMethod);
      SetRangeValues(termNode, el.text, true);
      proxNode->terms.push_back (termNode);
      return proxNode;
      
    } else {
      here = oldHere; // backtrack
      proxNode->terms.push_back (termNode);
      ParseProxModifiers(here, end, proxNode);
      return proxNode;
    }
  } else if (el.lexType == QuoteE) {
    bool error = false;
    ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
    if (error) {
      delete proxNode;
      return NULL;
    }
    return proxNode;
  }
  else if (el.lexType == OpenSquareBracketE) {
    bool error = false;
    proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error);
    if (error) {
      delete proxNode;
      return NULL;
    }
    ParseProxModifiers (here, end, proxNode);
    return proxNode;
  }

  // not a term
  here = oldHere;
  delete proxNode;
  return NULL;
}


static QueryNode *ParseExpression (UCArray::const_iterator &here,
				   UCArray::const_iterator end,
				   int defaultBoolCombine,
				   int defaultStemMethod) {
  LexEl el;
  QueryNode *curTree = NULL;
  UCArray::const_iterator oldHere = here;
  while (ParseLexEl (here, end, el)) {
    if (el.lexType == CloseBracketE) {
      // parsebracketexpression is waiting for the last bracket, so put it back
      here = oldHere;
      break;
      
    } else if (el.lexType == OpenSquareBracketE ||
	       el.lexType == OpenBracketE ||
	       el.lexType == TermE ||
	       el.lexType == QuoteE ||
	       el.lexType == IntegerE ) { 
      
      // some type of term, back track and parse it
      here = oldHere;

      //  parse the term
      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine, 
				       defaultStemMethod);
      if (newTerm == NULL) {
	delete curTree;
	return NULL;
      }

      // if default==1, AND, else if==0, OR
      if (defaultBoolCombine) {
	curTree = AndAdd (curTree, newTerm);
      }
      else {
	curTree = OrAdd (curTree, newTerm); 
      }
      
    } else if (el.lexType == AndOpE) {
      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
				       defaultStemMethod);
      if (newTerm == NULL) {
	delete curTree;
	return NULL;
      }
      curTree = AndAdd (curTree, newTerm);
      
    } else if (el.lexType == OrOpE) {
      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
				       defaultStemMethod);
      if (newTerm == NULL) {
	delete curTree;
	return NULL;
      }
      curTree = OrAdd (curTree, newTerm);
      
    } else if (el.lexType == NotOpE) {
      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
				       defaultStemMethod);
      if (newTerm == NULL) {
	delete curTree;
	return NULL;
      }
      curTree = NotAdd (curTree, newTerm); 
      
    } else if (el.lexType == UnknownE) {
      // just ignore it 
    } else {

      // syntax error, return NUll
      delete curTree;
      return NULL;
    }

    oldHere = here;
  }

  return curTree;
}

QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
		       int defaultStemMethod, int maxnumeric) {
  if (4 < maxnumeric && maxnumeric < 512) {
    MAXNUMERIC = maxnumeric;
  }
  UCArray::const_iterator here = queryStr.begin();
  UCArray::const_iterator end = queryStr.end();
  return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
}
