/**********************************************************************
 *
 * hashfile.cpp -- 
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include <stdio.h>

#define MAXNUMLEN  100   // bytes


// The number class is used to store very long unsigned integers.
// The least significant of each number is stored first. Numbers can 
// be no longer than MAXNUMLEN bytes long. A number with length=0
// is considered to be equal to 0. There should be no extra zeros
// at the front of the number.

struct number {
  unsigned int len;
  unsigned char num[MAXNUMLEN];

  number () {len = 0;}
};


// evaluates a = a + b
// a and b can be the same number
void my_inc (number &a, number &b) {
  unsigned int carry = 0;
  unsigned int num = 0;
  unsigned int i = 0;

  while ((carry || (i < a.len) || (i < b.len)) && (i < MAXNUMLEN)) {
    num = carry;
    if (i < a.len) num += a.num[i];
    if (i < b.len) num += b.num[i];

    if (num >= 256) {
      num -= 256; // assumes that each number in the list < 256
      carry = 1;
    } else {
      carry = 0;
    }
    a.num[i] = (unsigned char)num;

    ++i;
  }

  a.len = i;
}


// evaluates a = a - b if a >= b
// returns true if the subtraction was possible
int my_ifpos_dec (number &a, number &b) {
  int carry = 0;
  int num = 0;
  int i = 0;
  unsigned int len = 0; // the last non-zero digit

  // make sure a >= b
  if (b.len > a.len) return 0;
  if (b.len == a.len) {
    i = a.len - 1;
    while (i >= 0) {
      if (a.num[i] > b.num[i]) break;
      if (a.num[i] < b.num[i]) return 0;
      --i;
    }
  }

  unsigned int j = 0;
  len = 0;
  while ((j < a.len) || (j < b.len)) {
    num = -carry;
    if (j < a.len) num += a.num[j];
    if (j < b.len) num -= b.num[j];

    if (num < 0) {
      num += 256; // assumes each number in the list < 256
      carry = 1;
    } else {
      carry = 0;
    }

    a.num[j] = (unsigned char) num;
    
    ++j;
    if (num != 0) len = j;
  }

  a.len = len;

  if (carry) {
    // panic, this should NOT happen
    fprintf (stderr, "panic: error in my_ifpos_dec\n");
  }

  return 1;
}


char *my_convert_num (number &a) {
  char *result = new char[(a.len*2)+1];
  int i,len;
  char convert[16] = {'0','1','2','3','4','5','6','7',
		      '8','9','a','b','c','d','e','f'};

  // zero might be a special case
  if (a.len == 0) {
    result[0] = '0';
    result[1] = '\0';
    return result;
  }

  for (i=a.len-1, len=0; i >= 0; --i) {
    result[len++] = convert[a.num[i]/16];
    result[len++] = convert[a.num[i]%16];
  }
  result[len] = '\0';

  return result;
}



// create a hash string from the contents of a file.
//
// The file is treated as a large base 256 number (each char is a digit),
// and the hash value is the remainder when this number is divided by a
// very large prime.
//
// PROBLEM: This is a flawed hash function because the rightmost (highest)
// value in remainder is more likely to be a "1" than it is to be any other
// number.  
//
// EVIDENCE: About 50% of the files in any GSDL directory have a hash code
// that starts with 01* which implies (based on the my_convert_num function)
// that the rightmost "digit" of the remainder = z such that (z % 16 = 0)
// and (z / 16 = 1) which means (z = 1).
//
// MAJOR REASON: suppose our prime number was 19.  Then, if we have a 
// reasonably random distribution of numbers for which we are going to 
// calculate (N % 19), we expect to get a roughly uniform distribution
// of remainders where the possible values are 0, 1, 2, 3... 18.  The
// problem is that 9 out of the 19 possible values (10,11... 18) start
// with the digit 1.  Thus our hascode will start with "01".
//
// ANOTHER PROBLEM: Characters in the file are read one at a time; after each 
// one is read it is prepended to the remainder, then the remainder is
// recalculated on the string thus far seen.  I am sure the math here 
// is wrong - if I try calculating (111 mod 7) by the same algorithm, it
// simply does nor work.
//
// ANOTHER POSSIBLE PROLEM: Each character from the file is read into 
// remainder at the most significant end, and when that character
// is a zero you get a number like "01" which would be considered 
// larger than a number like "8" because it is longer (two  digits
// instead of 1).  
//
// These comments added by Gordon Paynter (gwp@cs.waikato.ac.nz) in
// June 2000.  I didn't write any code, however.

char *hashfile (char *filename) {
  FILE *infile = (FILE *)NULL;
  int i;

  number primepow[8];
  number pow;
  number remainder;
  unsigned char c;

  // our prime number (618970019642690137449562111)
  pow.num[0] = 255;  pow.num[1] = 255;
  pow.num[2] = 255;  pow.num[3] = 255;
  pow.num[4] = 255;  pow.num[5] = 255;
  pow.num[6] = 255;  pow.num[7] = 255;
  pow.num[8] = 255;  pow.num[9] = 255;
  pow.num[10] = 255; pow.num[11] = 1;
  pow.len = 12;

  // calculate 8 multiples of the prime number.
  // These are used to find the remainder using only subtraction operations
  for (i=0; i<8; ++i) {
    primepow[i] = pow;
    my_inc (pow, pow);
  }

  // The "remainder" after division by the prime.  Our result.
  remainder.len = 0;

  // open the file
  infile = fopen (filename, "rb");
  if (infile == NULL) {
    return (char *)NULL;
  }
  c = (unsigned char)fgetc(infile);
  
  while (!feof (infile)) {

    // make sure the remainder has not grown too large
    if (remainder.len == MAXNUMLEN-1) {
      fprintf (stderr, "ERROR - number overflow\n");
      return (char *)NULL;
    }

    // remainder = remainder * 256 + c
    for (i=remainder.len; i>0; --i) {
      remainder.num[i] = remainder.num[i-1];
    }
    remainder.num[0] = c;
    if (remainder.len > 0 || c != 0) remainder.len = remainder.len+1;
    
    // remainder = (remainder % large-prime-number)
    for (i=7; i>=0; --i) {
      my_ifpos_dec (remainder, primepow[i]);
    }

    // read a new character from the file
    c = (unsigned char)fgetc(infile);
  }

  fclose (infile);

  return my_convert_num (remainder);
}


int main (int argc, char *argv[]) {
  int i;
  char *result;

  for (i = 1; i < argc; ++i) {
    result = hashfile (argv[i]);
    if (result == NULL) {
      printf ("ERROR - could not process %s\n", argv[i]);
    } else {
      printf ("%s: %s\n", argv[i], result);
      delete [] result;
    }
  }

  return 0;
}
