package org.greenstone.gatherer.util;

import java.io.Reader;
import java.io.IOException;

import java.io.File;
import java.io.FileReader;

import java.util.regex.Pattern;

/**
 * A {@link Reader} wrapper that discards everything preceding the start of
 * the XML content in an underlying reader.
 *
 * <p>The constructor reads the underlying reader one character at a time
 * until it sees an XML declaration ({@code <?xml}), a {@code <!DOCTYPE}
 * declaration or the opening of an element. Everything read before that
 * point, including XML comments, is echoed to {@code System.err} and dropped.
 * Subsequent reads return the detected opening sequence followed by the rest
 * of the underlying reader.
 *
 * <p>If no XML start is found, a message is printed to {@code System.err}
 * and the reader simply delegates to the (by then exhausted) underlying
 * reader.
 */
public class RemoveContentBeforeRootElementXMLReader extends Reader {

  /**
   * Sequences that mark the start of XML content, tried in this order:
   * the XML declaration, a doctype declaration, and the opening of an
   * element. The element pattern accepts ASCII names containing letters,
   * digits, '_', ':', '.' and '-', terminated by any whitespace, '>' or '/'.
   */
  static final Pattern[] xmlIndicators = {
    Pattern.compile("<\\?xml"), //usual xml declaration
    Pattern.compile("<!DOCTYPE"), //doctype declaration
    Pattern.compile("<[a-zA-Z_:-][a-zA-Z0-9_:.-]*[\\s>/]") // the beginning of a root node
  };

  /** Opening delimiter of an XML comment. */
  static final Pattern commentStart = Pattern.compile("<!--");

  /** Closing delimiter of an XML comment. */
  static final Pattern commentStop = Pattern.compile("-->");

  /** The underlying reader that all content is pulled from. */
  Reader ur;

  /**
   * The opening XML sequence consumed by the constructor that still has to
   * be handed back to the caller; null when nothing is pending.
   */
  String finalBuffer = null;

  /** Index of the next character of {@link #finalBuffer} to return. */
  int finalBufferIndex = 0;

  /**
   * Wraps the given reader and immediately consumes (and echoes to
   * {@code System.err}) everything that precedes the start of the XML.
   *
   * <p>An {@link IOException} from the underlying reader during this scan
   * is reported on {@code System.err} and treated as end of input, so the
   * constructor always terminates.
   *
   * @param ur the reader to wrap
   * @throws NullPointerException if {@code ur} is null
   */
  public RemoveContentBeforeRootElementXMLReader( Reader ur ) {

    if ( ur == null ) {
      throw new NullPointerException( "ur" );
    }
    this.ur = ur;

    boolean found = false;
    boolean inComment = false;
    StringBuffer buffer = null;

    while ( !found ) {

      //read a character; a failing reader ends the scan instead of being retried forever
      int c;
      try {
        c = ur.read();
      } catch( IOException e ) {
        System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" + ": " + e );
        break;
      }

      //break out if we have reached the end of the input
      if ( c == -1 ) {
        break;
      }

      //we start buffering when we come across the first <
      //regardless of whether it turns out to be a relevant < or not;
      //until then, just display the character and move onto the next one
      if ( buffer == null ) {
        if ( (char)c != '<' ) {
          System.err.print( (char)c );
          continue;
        }
        buffer = new StringBuffer();
      }

      buffer.append( (char)c );
      String text = buffer.toString();

      //inside a comment only the closing delimiter is of interest
      if ( inComment ) {
        if ( commentStop.matcher(text).find() ) {
          inComment = false;
          System.err.print( text );
          buffer = new StringBuffer();
        }
        continue;
      }

      //a comment has just opened: flush what we have and start skipping
      if ( commentStart.matcher(text).find() ) {
        inComment = true;
        System.err.print( text );
        buffer = new StringBuffer();
        continue;
      }

      //check each indicator to see if found
      for ( int i = 0; i < xmlIndicators.length && !found; i++ ) {
        java.util.regex.Matcher m = xmlIndicators[i].matcher(text);
        if ( m.find() ) {
          found = true;
          //flush the previous characters in the buffer to the console
          System.err.print( text.substring(0, m.start()) );
          //keep the opening sequence so read() can hand it back first
          finalBuffer = text.substring( m.start() );
        }
      }
    }

    if ( !found ) {
      System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
        "The XML being loaded was not valid: couldn't find start of XML input" );
    }

  }

  /**
   * Reads characters into a portion of an array, one character at a time
   * via {@link #read()}.
   *
   * <p>Characters are never written past the end of {@code cbuf}; if
   * {@code off + len} exceeds the array length, only the characters that
   * fit are read.
   *
   * @param cbuf destination array
   * @param off index at which to start storing characters
   * @param len maximum number of characters to read
   * @return the number of characters actually stored, or -1 if the end of
   *         the stream was reached before any character could be stored
   * @throws IOException if the underlying reader fails
   */
  public int read( char[] cbuf, int off, int len ) throws IOException {

    //computed in long so that off+len cannot overflow
    int end = (int)Math.min( (long)off + (long)len, (long)cbuf.length );

    int i = off;
    for ( ; i < end; i++ ) {

      //read from underlying reader
      int c = read();

      //catch end of stream
      if ( c == -1 ) {
        if ( i == off ) {
          return -1;
        }
        return i - off;
      }

      //insert character into the array
      cbuf[i] = (char)c;
    }

    //report what was really stored, which may be less than len
    return i - off;

  }

  /**
   * Reads a single character: first the pending opening XML sequence found
   * by the constructor, then whatever the underlying reader provides.
   *
   * @return the character read, or -1 at the end of the stream
   * @throws IOException if the underlying reader fails
   */
  public int read() throws IOException {

    //flush the buffer containing the opening XML sequence
    if ( finalBuffer != null && finalBuffer.length() > finalBufferIndex ) {
      char c = finalBuffer.charAt(finalBufferIndex++);
      if ( finalBufferIndex == finalBuffer.length() ) {
        finalBuffer = null;
      }
      return c;
    }

    return ur.read();
  }

  /**
   * Closes the underlying reader.
   *
   * @throws IOException if the underlying reader fails to close
   */
  public void close() throws IOException {
    ur.close();
  }

  /**
   * Manual test driver: wraps the file {@code text.xml} in the current
   * directory and prints the retained XML to standard output. Exits with
   * status -1 if the file does not exist.
   *
   * @param args ignored
   */
  public static void main ( String[] args ) {

    //init
    System.out.println( "------------\nWill now initialise the test reader\n------------" );
    RemoveContentBeforeRootElementXMLReader parser = null;
    try {
      parser = new RemoveContentBeforeRootElementXMLReader( 
        new FileReader( new File("text.xml") ) );
    } catch ( java.io.FileNotFoundException fnfe ) {
      System.err.println( "Please create text.xml to test this class" );
      System.exit(-1);
    }

    //read the rest of the input
    System.out.println( "------------\nWill now read the rest of the input\n------------" );
    try {
      int c = 0;
      while ( ( c = parser.read() ) != -1 ) {
        System.out.print( (char)c );
      }
    } catch ( Exception e ) {
      System.err.println("Exception: " + e);
    } finally {
      //release the file handle whether or not reading succeeded
      try {
        parser.close();
      } catch ( IOException ioe ) {
        System.err.println("Exception: " + ioe);
      }
    }
    
  }


}
