###########################################################################
#
# NutchTextDumpPlugin.pm -- plugin for dump.txt files generated by Nutch
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the 
# University of Waikato, New Zealand.
#
# Copyright (C) 2002 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This plugin was originally created to process Nutch dump.txt files produced from recrawling
# commoncrawl (CC) results for pages detected by CC as being in Maori.
# It splits each web site's dump.txt into its individual records: as each record represents a web page,
# this produces one greenstone document per web page.
#
# For a commoncrawl collection of siteID-labelled folders containing dump.txt files each,
# - set <importOption name="OIDtype" value="dirname"/>
# - Create 2 List browsing classifiers (with bookshelf_type set to always) on ex.siteID and ex.srcDomain
# both sorted by ex.srcURL, and an ex.Title classifier.
# For the ex.srcDomain classifier, set removeprefix to: https?\:\/\/(www\.)?
# An alternative is to build that List classifier on ex.basicDomain instead of ex.srcDomain.
# Set this List classifier's "partition_type_within_level" option to "per_letter".
# - Add search indexes on text (default), Title, basicDomain, siteID, Identifier, srcURL (not working)
#
# Finally, in the "display" format statement, add the following before the "wrappedSectionText" to
# display the most relevant metadata of each record:
  # <gsf:template name="documentContent">
  #   <div id="nutch-dump-txt-record">
  #     <h3>Record:</h3>
  #     <br/>
  #     <dl>
  #       <dt>URL:</dt>
  #       <dd>
  #         <gsf:metadata name="srcURL"/>
  #       </dd>
  #       <dt>Title:</dt>
  #       <dd>
  #         <gsf:metadata name="ex.Title"/>
  #       </dd>
  #	  <dt>Identifier:</dt>
  #       <dd>
  #         <gsf:metadata name="Identifier"/>
  #       </dd>
  #       <dt>SiteID:</dt>
  #       <dd>
  #         <gsf:metadata name="siteID"/>
  #       </dd>
  #       <dt>Status:</dt>
  #       <dd>
  #         <gsf:metadata name="status"/>
  #       </dd>
  #       <dt>ProtocolStatus:</dt>
  #       <dd>
  #         <gsf:metadata name="protocolStatus"/>
  #       </dd>
  #       <dt>ParseStatus:</dt>
  #       <dd>
  #         <gsf:metadata name="parseStatus"/>
  #       </dd>
  #       <dt>CharEncodingForConversion:</dt>
  #       <dd>
  #         <gsf:metadata name="CharEncodingForConversion"/>
  #       </dd>
  #       <dt>OriginalCharEncoding:</dt>
  #       <dd>
  #         <gsf:metadata name="OriginalCharEncoding"/>
  #       </dd>
  #     </dl>
  #   </div>

# + DONE: remove illegible values for metadata _rs_ and _csh_ in the example below before
# committing, in case their encoding affects the loading/reading in of this perl file.
#
# Example record in dump.txt to process:
	# https://www.whanau-tahi.school.nz/	key:	nz.school.whanau-tahi.www:https/
	# OR: http://yutaka.it-n.jp/apa/750010010.html        key:    jp.it-n.yutaka:http/apa/750010010.html
	# baseUrl:	null
	# status:	2 (status_fetched)
	# fetchTime:	1575199241154
	# prevFetchTime:	1572607225779
	# fetchInterval:	2592000
	# retriesSinceFetch:	0
	# modifiedTime:	0
	# prevModifiedTime:	0
	# protocolStatus:	SUCCESS, args=[]
	# signature:	d84c84ccf0c86aa16a19e03cb1fc5827
	# parseStatus:	success/ok (1/0), args=[]
	# title:	Te Kura Kaupapa Māori o Te Whānau Tahi
	# score:	1.0
	# marker _injmrk_ : 	y
	# marker _updmrk_ : 	1572607228-9584
	# marker dist : 	0
	# reprUrl:	null
	# batchId:	1572607228-9584
	# metadata CharEncodingForConversion : 	utf-8
	# metadata OriginalCharEncoding : 	utf-8
	# metadata _rs_ :
	# metadata _csh_ :
	# text:start:
	# Te Kura Kaupapa Māori o Te Whānau Tahi He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai TE KURA KAUPAPA MĀORI O TE WHĀNAU TAHI He mihi Kei te mōteatea tonu nei ngā mahara ki te huhua kua mene atu ki te pō, te pōuriuri, te pōtangotango, te pō oti atu rā. Kua rite te wāhanga ki a rātou, hoki mai ki te ao tūroa nei Ko Io Matua Kore te pūtaketanga, te pūkaea, te pūtātara ka rangona whānuitia e te ao. Ko tāna ko ngā whetū, te marama, te haeata ki a Tamanui te rā. He atua i whakateretere mai ai ngā waka i tawhiti nui, i tawhiti roa, i tawhiti mai rā anō. Kei nga ihorei, kei ngā wahapū, kei ngā pukumahara, kei ngā kanohi kai mātārae o tō tātou nei kura Aho Matua, Te Kura Kaupapa Māori o Te Whanau Tahi. Anei rā te maioha ki a koutou katoa e pūmau tonu ki ngā wawata me ngā whakakitenga i whakatakotoria e ngā poupou i te wā i a rātou. Ka whakanuia hoki te toru tekau tau o tēnei kura mai i tōna orokohanga timatanga tae noa ki tēnei wā Ka pūmau tōnu mātou ki te whakatauki o te kura e mea ana “Poipoia ō tātou nei pūmanawa” Takiritia tonutia te ra ki runga i Te Kura Kaupapa Maori o Te Whanau Tahi . Back to Top " Poipoia ō tātou nei pūmanawa -  Making our potential a reality "   ©  Te Kura Kaupapa Māori o Te Whānau Tahi, 2019  Cart ( 0 )
	# text:end:
	#
	# https://www.whanau-tahi.school.nz/cart	key:	nz.school.whanau-tahi.www:https/cart
	# baseUrl:	null
	# status:	2 (status_fetched)
	# ...
#
# - Some records may have empty text content between the text:start: and text:end: markers,
# while other records may be missing these markers along with any text.
# - Metadata is of the form key : value, but some metadata values contain ":", for example
# "protocolStatus" metadata can contain a URL for value, including protocol that contains ":".
# - metadata _rs_ and _csh_ contain illegible values, so this code discards them when storing metadata.
#
# If you provide a keep_urls_file when configuring NutchTextDumpPlugin and its path is relative,
# then the plugin will check the collection's etc folder for that urls file.


package NutchTextDumpPlugin;

use SplitTextFile;

use Encode;
use unicode;
use util;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa


# Seems to be
# nohup command
# Not: nohup command > bla.txt 2&>1 &
# nor even: nohup command &
#    nohup.out (possibly both STDERR and STDOUT, do a quick test first and then delete nohup.out before re-running)
#    in the folder the command is run
# Delete nohup.out when re-running command.
# Tripped up and unhappy only when commands require keyboard input at any stage.
#
#
# TODO:
# Use "od" to print out bytevalues of the dump.txt file to check _rs_ and _csh_
# Also google Nutch about what those fields mean.
# od -a
# every byte as ASCII character
# od -ab
# ASCII and bytevalue:
# First comes byteoffset and then ascii character (sp for space). Line underneath the numeric byte values in hex of the individual characters.
#
# + 1. Split each dump.txt file into its individual records as individual docs
# + 2. Store the meta of each individual record/doc
# ?3. Name each doc, siteID.docID else HASH internal text. See EmailPlugin?
# + In SplitTextFile::read(), why is $segment which counts discarded docs too used to add record ID
# rather than $count which only counts included docs? I am referring to code:
# 	$self->add_OID($doc_obj, $id, $segment);
# Because we get persistent URLs, regardless of whitelist urls file content!
# The way I've solved this is by setting the OIDtype importOption. Not sure if this is what was required.
# + 4. Keep a map of all URLs seen - whitelist URLs.
# + 5. Implement the optional input file of URLs: if infile provided, keep only those records
#    whose URLs are in the map. Only these matching records should become docs.
# 6. Rebuild full collection of all dump.txt files with this collection design.
#
# TIDY UP:
# + Create util::trim()
# + Add to perl's strings.properties: NutchTextDumpPlugin.keep_urls_file
#
# CLEANUP:
# + Remove MetadataRead functions and inheritance
#
# QUESTIONS:
# - encoding = utf-8, changed to "utf8" as required by copied to_utf8(str) method. Why does it not convert
# the string parameter but fails in decode() step? Is it because the string is already in UTF8?
# - Problem converting text with encoding in full set of nutch dump.txt when their encoding is windows-1252 and Shift-JIS.
# - TODOs
#

# CHECK:
# + title fallback is URL. Remove domain/all folder prefix (unless nothing remains), convert underscores and hyphens to spaces.
# + util::tidy_up_OID() prints warning. SiteID is foldername and OIDtype=dirname, so fully numeric
# siteID to OID conversion results in warning message that siteID is fully numeric and gets 'D' prefixed.
# Is this warning still necessary?
# - Ask about binmode usage (for debugging) in this file


# To get all the isMRI results, I ran Robo-3T against our mongodb as
# in the instructions at http://trac.greenstone.org/browser/other-projects/maori-lang-detection/MoreReading/mongodb.txt
# Then I launched Robo-3T and connected to the mongodb
#
# Then in the "ateacrawldata" database, I ran the following queries
# to get a URL listing of all the Webpages where isMRI = true as determined
# by apache openNLP.
#
#db.getCollection('Webpages').find({isMRI:true}).count();
#7830
#
#db.getCollection('Webpages').find({isMRI:true},{URL: 1, _id: 0});
#
#Then I set robo-3T's output display to display 8000 results on a page, then copied the results into this file below.
#
# I cleaned out all the JSON from the results using regex in Notepad++.
# This then becomes our urls.txt file, which I put into the cc nutch crawl
# GS3 collection's etc folder under the name isMRI_urls.txt,
# to consider processing only webpages apache Open-NLP detected as isMRI
# into our collection.
# Remember to configure the NutchTextDumpPlugin with option "keep_urls_file" = isMRI_urls.txt to make use of this.
#
# + ex meta -> don't add with ex. prefix
# + check for and call to setup_keep_urls(): move into process() rather than doing this in more convoluted way in can_process_this_file()
# + util::tidy_up_oid() -> print callstack to find why it's called on every segment
# X- binmode STDERR: work out what default mode on STDERR is and reset to that after printing debug messages in utf8 binmode
# - test collection to check various encodings with and without to_utf8() function - tested collection 00436 in collection cctest3.
# The srcURL .../divrey/shaar.htm (Identifier: D00436s184) is in Hebrew and described as being in char encoding iso-8859-8.
# But when I paste the build output when using NutchTextDumpPlugin.pm_debug_iso-8859-8
# into emacs, the text for this record reads and scrolls R to L in emacs.
# When previewing the text in the full text section in GS3, it reads L to R.
# The digits used in the text seem to match, occurring in reverse order from each other between emacs and GS3 preview.
# Building displays error messages if to_utf8() called to decode this record's title meta or full text
# using the discovered encoding.
    
# Compile-time setup: put Greenstone's bundled CPAN directory
# ($GSDLHOME/perllib/cpan) at the front of the module search path and
# declare SplitTextFile as this plugin's parent class.
BEGIN {
    my $cpan_dir = "$ENV{'GSDLHOME'}/perllib/cpan";
    unshift(@INC, $cpan_dir);

    @NutchTextDumpPlugin::ISA = ('SplitTextFile');
}

# Arguments accepted by this plugin. The {...} "desc" values are lookup keys
# (see the note in the header about adding NutchTextDumpPlugin.keep_urls_file
# to perl's strings.properties). The default regexes come from the
# get_default_*_exp() subs defined further down in this file.
my $arguments = [
    {
        'name' => "keep_urls_file",
        'desc' => "{NutchTextDumpPlugin.keep_urls_file}",
        'type' => "string",
        #'deft' => "urls.txt",
        'reqd' => "no",
    },
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        'deft' => get_default_split_exp(),
        'reqd' => "no",
    },
];

# Plugin description record pushed onto the "OptList" by new().
my $options = {
    'name'     => "NutchTextDumpPlugin",
    'desc'     => "{NutchTextDumpPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};

# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the SplitTextFile constructor
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's $arguments and $options
# Returns the object built by SplitTextFile, re-blessed into $class. Unless
# the object is flagged 'info_only', the keep-urls state is initialised to
# "not loaded yet" (see setup_keep_urls()).
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    # register this plugin and its argument/option descriptions
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # when only plugin info is wanted, don't worry about the options
    unless ($self->{'info_only'}) {
        $self->{'keep_urls_processed'} = 0;
        $self->{'keep_urls'} = undef;
    }

    # $self->method() calls are only possible AFTER this bless
    return bless($self, $class);
}


# Loads the optional whitelist of URLs named by the 'keep_urls_file' option.
#
# Always sets $self->{'keep_urls_processed'} to 1 so that process() calls
# this only once. Afterwards $self->{'keep_urls'} is either a hash ref of
# whitelisted URLs (filled in by parse_keep_urls_file()) or undef, in which
# case no records get filtered.
# Reads $self->{'verbosity'}, {'outhandle'}, {'failhandle'} and
# {'keep_urls_file'}. Returns nothing useful.
sub setup_keep_urls {
    my $self = shift (@_);

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    # flag to track whether this method has been called already during import
    $self->{'keep_urls_processed'} = 1;

    my $configured_file = $self->{'keep_urls_file'};
    if (!$configured_file) {
        my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" .
            "    No records will be filtered.\n";
        print $outhandle $msg if ($verbosity > 2);

        return;
    }

    # resolve the configured name to an actual file
    my $keep_urls_file = &util::locate_config_file($configured_file);
    if (!defined $keep_urls_file) {
        # $keep_urls_file is undef in this branch, so the message has to name
        # the file the user configured rather than the (missing) resolved path.
        my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file $configured_file.\n" .
            "    No records will be filtered.\n";
        print $outhandle $msg;

        # Not fatal: it just means all records in dump.txt will be processed.
        $self->{'keep_urls'} = undef;
        return;
    }

    $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle);
    return;
}


# Reads the whitelist file $urls_file (opened as UTF-8) one line at a time,
# as it may be very long, and stores the result in $self->{'keep_urls'}.
#
#   $urls_file  - path of the file to read
#   $outhandle  - handle that receives the error message if the file can't be opened
#   $failhandle - second handle that receives the same error message
#
# Only lines starting with http:// or https:// (after trimming) are kept;
# a trailing ",COUNTRYCODE" (uppercase letters, e.g. ",NZ" or ",UNKNOWN")
# is removed from each URL first.
# On return $self->{'keep_urls'} is a hash ref mapping each URL to 1, or
# undef when no URL was read (empty file, or file could not be opened), so
# that callers only need to test for definedness.
sub parse_keep_urls_file {
    my $self = shift (@_);
    my ($urls_file, $outhandle, $failhandle) = @_;

    my %keep_urls = ();

    my $fh;
    if (open($fh, '<:encoding(UTF-8)', $urls_file)) {
        while (defined (my $line = <$fh>)) {
            $line = &util::trim($line);

            # add only URLs
            next unless ($line =~ m@^https?://@);

            # remove any ",COUNTRYCODE" at end; the country code can be NZ
            # but also UNKNOWN, so it's not always 2 chars
            $line =~ s/,[A-Z]+$//;

            $keep_urls{$line} = 1;
        }
        close $fh;
    } else {
        # Not fatal: all records simply get processed.
        my $msg = "NutchTextDumpPlugin ERROR: Unable to open file keep_urls_file: \"" .
            $self->{'keep_urls_file'} . "\".\n " .
            "    No records will be filtered.\n";
        print $outhandle $msg;
        print $failhandle $msg;
    }

    # An empty map means "no whitelist": store undef rather than an empty hash.
    # (Emptiness is tested directly on the hash; no copy of the map is made.)
    $self->{'keep_urls'} = scalar(keys %keep_urls) ? \%keep_urls : undef;

    return;
}

# Accept "dump.txt" files (which are in numeric siteID folders),
# and txt files with numeric siteID, e.g. "01441.txt"
# if I preprocessed dump.txt files by renaming them this way.
# Default for the process_exp option: a case-insensitive pattern for file
# names ending in "dump.txt" or "<digits>.txt". Any arguments are ignored.
sub get_default_process_exp {
    my $default_process_exp = q^(?i)((dump|\d+)\.txt)$^;
    return $default_process_exp;
}


# Default for the split_exp option: the regex (as a string) that gets handed
# to perl's split() to cut a dump.txt into its individual records.
#
# How the pattern is put together:
#   - A record starts on a line that begins with the url protocol and
#     contains " key: .... http(s)/"; the line before it is empty
#     (\r\n for msdos eol, \n for unix).
#   - split() throws away whatever the delimiter matches, while any capturing
#     group inside the delimiter would come back as an extra element. The
#     blank line(s) before a record are therefore matched in a NON-capturing
#     group (?:...) so that they are simply discarded:
#       https://stackoverflow.com/questions/3512471/what-is-a-non-capturing-group-in-regular-expressions
#   - The first line of the record must stay part of the upcoming record, so
#     it is only tested with a positive look-ahead (?=...), which matches
#     without consuming anything:
#       https://stackoverflow.com/questions/14907772/split-but-keep-delimiter
#       http://www.regular-expressions.info/lookaround.html
#   - .+? and .*? are the non-greedy forms:
#       https://stackoverflow.com/questions/11898998/how-can-i-write-a-regex-which-matches-non-greedy
sub get_default_split_exp {
    my $default_split_exp = q^(?:$|\r?\n\r?\n)(?=https?://.+?\skey:\s+.*?https?/)^;
    return $default_split_exp;
}

# TODO: Copied method from MARCPlugin.pm and uncommented return statement when encoding = utf8
# Move to a utility perl file, since code is mostly shared?
# The bulk of this function is based on read_line in multiread.pm
# Unable to use read_line original because it expects to get its input
# from a file.  Here the line to be converted is passed in as a string

# TODO:
# Is this function even applicable to NutchTextDumpPlugin?
# I get errors in this method when encoding is utf-8 in the decode step.
# I get warnings/errors somewhere in this file (maybe also at decode) when encoding is windows-1252.

# Converts $line from the named $encoding and returns the result.
#   - "utf8": $line is handed back exactly as it came in (no decode step).
#   - "iso_8859_1" / "windows-1252": converted with unicode::ascii2utf8(),
#     as that is faster than going through convert2unicode().
#   - anything else: unicode::convert2unicode() followed by unicode::unicode2utf8().
# In the last two cases the converted byte string is finally run through
# decode("utf8", ...) to turn it into a Unicode aware string, so that full
# Unicode aware pattern matching can be used on it, for instance
# 's/\x{0101}//g' or '[[:upper:]]'.
sub to_utf8
{
    my ($self, $encoding, $line) = @_;

    # nothing needs to be done
    return $line if ($encoding eq "utf8");

    my $utf8_bytes;
    if ($encoding eq "iso_8859_1" || $encoding eq "windows-1252") { # TODO: do this also for windows-1252?
        $utf8_bytes = &unicode::ascii2utf8(\$line);
    }
    else {
        my $unicode_array = &unicode::convert2unicode($encoding, \$line);
        $utf8_bytes = &unicode::unicode2utf8($unicode_array);
    }

    # at this point $utf8_bytes is a binary byte string
    return decode("utf8", $utf8_bytes);
}



# do plugin specific processing of doc_obj
# This gets done for each record found by SplitTextFile in Nutch dump.txt files.
# Turns one record of a Nutch dump.txt (one crawled web page) into the
# metadata and text of $doc_obj.
#
#   $textref  - ref to the record's raw text; it is REPLACED by the record's
#               page text wrapped in <pre>...</pre> (or "<pre></pre>" if none)
#   $file     - file name, used (via get_siteID()) to work out the siteID
#   $doc_obj  - document object receiving the metadata and text
#   $pluginfo, $base_dir, $metadata, $gli - accepted but not used here
#
# Returns 0 (record discarded) when a keep-urls whitelist is in effect and
# the record's URL is not in it, else 1.
#
# Metadata added: srcURL, key, srcDomain, basicDomain, one field per
# "name: value" line of the record (except the illegible _rs_ and _csh_
# fields), FileFormat, Title and siteID. No namespace is given, so - per the
# original author's note - they end up as ex. metadata.
#
# NOTE: the record's CharEncodingForConversion is only checked in order to
# print a warning when it isn't utf-8; title and text are stored as they are
# and no charset conversion with to_utf8() is attempted.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # Only load the urls from the keep_urls_file into a hash if we've not done so before.
    # The original author's notes say process() is only called during import and not
    # during buildcol, which makes this the best spot to do the one-off setup.
    if(!$self->{'keep_urls_processed'}) {
        $self->setup_keep_urls();
    }

    my $cursection = $doc_obj->get_top_section();

    # (1) parse out the metadata of this record
    my $title_meta;
    my $found_text_start = 0;

    my @lines = split(/(?:\r?\n)+/, $$textref);

    foreach my $line (@lines) {

        # first line is special and contains the URL (no metaname)
        # and the inverted URL labelled with metaname "key"
        if($line =~ m/^https?/ && $line =~ m/\s+key:\s+/) {
            # Split at the first whitespace-delimited "key:" only, so that a URL
            # which itself contains the text "key:" is left intact.
            my ($url, $key) = split(/\s+key:\s+/, $line, 2);
            $url = &util::trim($url);
            $key = &util::trim($key);

            # if we have a keep_urls hash, then only process records of whitelisted urls
            if(defined $self->{'keep_urls'} && !$self->{'keep_urls'}->{$url}) {
                # URL not whitelisted, so stop processing this record
                print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): discarding record for URL not whitelisted: $url\n"
                    if $self->{'verbosity'} > 3;
                return 0;
            } else {
                print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): processing record of whitelisted URL $url...\n"
                    if $self->{'verbosity'} > 3;
            }
            $doc_obj->add_utf8_metadata ($cursection, "srcURL", $url);
            $doc_obj->add_utf8_metadata ($cursection, "key", $key);

            # Let's also set the domain from the URL, as that will make a
            # more informative bookshelf label than siteID.
            # srcDomain keeps protocol:// and every non-slash after (this avoids
            # requiring presence of a subsequent slash); basicDomain is the same
            # without the protocol and without a leading "www.".
            my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@;
            $doc_obj->add_utf8_metadata ($cursection, "srcDomain", $domain);
            $doc_obj->add_utf8_metadata ($cursection, "basicDomain", $basicDomain);
        }
        # check for full text
        elsif ($line =~ m/text:start:/) {
            # if we've reached the full text portion, we're past the metadata portion of this record
            $found_text_start = 1;
            last;
        }
        # look for meta; the value part may itself contain ":" (e.g. protocol://url)
        elsif($line =~ m/^[^:]+:.+$/) {
            # Split on the FIRST ":" only (limit 2), so that colons in the value
            # survive. Splitting on every ":" and gluing the pieces back together
            # would turn "http://host" into "http//host".
            my ($metaname, $metavalue) = split(/:/, $line, 2);

            # skip "metadata _rs_" and "metadata _csh_" as these contain illegible characters for values
            next if ($metaname =~ m/metadata\s+_(rs|csh)_/);

            $metaname = &util::trim($metaname);
            $metavalue = &util::trim($metavalue);

            if($metaname eq "title") {
                # Title is added AFTER the loop (with a URL based fallback for
                # "null"/empty titles), so blank the value to skip adding it here.
                $title_meta = $metavalue;
                $metavalue = "";
            }
            elsif($metaname =~ m/CharEncodingForConversion/) { # TODO: or look for "OriginalCharEncoding"?
                if($metavalue ne "utf-8") {
                    my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL");
                    print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8 but $metavalue\n\tfor record: $srcURL\n";
                }
            }

            # remove "marker " or "metadata " strings from start of metaname
            $metaname =~ s/^(marker|metadata)\s+//;
            # remove underscores and all remaining spaces in metaname
            $metaname =~ s/[ _]//g;

            # add meta to docObject if both metaname and metavalue are non-empty strings
            if($metaname ne "" && $metavalue ne "") {
                # Don't explicitly prefix ex., as things become convoluted when retrieving meta
                $doc_obj->add_utf8_metadata ($cursection, $metaname, $metavalue);
            }
        }
        elsif ($line !~ m/^\s*$/) { # Not expecting any other type of non-empty line (or even empty lines)
            print STDERR "NutchTextDump line not recognised as URL meta, other metadata or text content:\n\t$line\n";
        }
    }

    # Add fileFormat as the metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "NutchDumpTxt");

    # Title fallback: if the title is missing, empty or "null", derive one from the record URL
    if(!($title_meta && $title_meta ne "" && $title_meta ne "null")) {
        my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL");
        if(defined $srcURL) {
            # Use the web page name without file ext for doc title, if web page name present,
            # else use basicURL (URL without protocol and www.) instead of srcURL,
            # else many docs get classified under "Htt" bucket for https

            my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@;
            my ($pageName) = $basicURL =~ m@([^/]+)$@;
            if (!$pageName) {
                $pageName = $basicURL;
            } else {
                # remove the file extension, i.e. only the LAST ".xyz" of the name
                # (anchored at the end so "my.page.html" becomes "my.page", not "my.html")
                $pageName =~ s@\.[^\.]+$@@;
                # replace _ and - with spaces
                $pageName =~ s@[_\-]@ @g;
            }

            print STDERR "@@@@ null/empty title for $basicURL to be replaced with: $pageName\n"
                if $self->{'verbosity'} > 3;
            $title_meta = $pageName;
        }
    }

    $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta);

    # When importOption OIDtype = dirname, the base_OID will be that dirname
    # which was crafted to be the siteID. However, because our siteID is all numeric,
    # a D gets prepended to create baseOID. Remove the starting 'D' to get actual siteID.
    my $siteID = $self->get_siteID($doc_obj, $file);
    $siteID =~ s/^D//;
    $doc_obj->add_utf8_metadata ($cursection, "siteID", $siteID);

    # (2) parse out text of this record: whatever sits between the "text:start:" and
    # "text:end:" marker lines. The /s modifier lets the text span several lines;
    # without it any text containing a newline would be silently dropped.
    my $page_text = "<pre></pre>";
    if($found_text_start && $$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/s) {
        my $inner_text = $1;
        if($inner_text !~ m/^\s*$/) {
            $page_text = "<pre>\n".$inner_text."\n</pre>";
        }
    }
    $$textref = $page_text;

    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}

# returns siteID when file in import of form siteID.txt
# returns siteID when import contains siteID/dump.txt (as happens when OIDtype=dirname)
# Returns whatever baseOID in other situations, not sure if meaningful, but shouldn't have
# passed can_process_this_file() test for anything other than siteID/dump.txt and siteID.txt anyway
# Works out the siteID for the record being processed, remembers it in
# $self->{'siteID'} and returns it.
#   - If $file contains "<digits>.txt" (e.g. 00001.txt), those digits are the siteID.
#   - Otherwise the value stored in $self->{'dirname_siteID'} by get_base_OID()
#     is used; if there is none yet (or it is false), get_base_OID($doc_obj)
#     is called to compute it. With OIDtype=dirname and docs stored as
#     00001/dump.txt this base OID corresponds to the site ID (callers strip
#     the 'D' prefix); in other cases the value has no real meaning.
sub get_siteID {
    my $self = shift(@_);
    my ($doc_obj, $file) = @_;

    my $siteID;
    # The dot is escaped so that only a literal ".txt" extension counts: with a
    # bare "." a path such as "1/txt/dump.txt" would wrongly yield siteID "1".
    if ($file =~ m/(\d+)\.txt/) {
        # file name without extension is site ID
        $siteID = $1;
    }
    else {
        $siteID = $self->{'dirname_siteID'} || $self->get_base_OID($doc_obj);
    }

    $self->{'siteID'} = $siteID;
    return $siteID;
}


# SplitTextFile::get_base_OID() has the side-effect of calling SUPER::add_OID()
# in order to initialise segment IDs.
# This then ultimately results in calling util::tidy_up_OID() to print warning messages
# about siteIDs forming all-numeric baseOIDs that require the D prefix prepended.
# In cases where site ID is the same as baseOID and is needed to set siteID meta, we want to avoid
# the warning messages but don't want to prevent the important side-effects of SplitTextFile::get_base_OID()
# So instead of overriding this method to calculate and store baseOID the first time and return
# the stored value on subsequent calls (which would have the undesirable result of losing the
# side-effect of ALWAYS calling super's get_base_OID(), even when there's a stored value), we just
# always store the return value before returning it. The check that first tests for a stored value
# to use, and otherwise forces it to be computed by calling this get_base_OID(), is pushed onto a
# separate function that calls this one: get_siteID(). Problem solved.
# Override that ALWAYS delegates to the superclass get_base_OID() and keeps a
# copy of its return value in $self->{'dirname_siteID'} for this plugin's own
# use (see get_siteID()). The superclass result is returned unchanged.
# Do NOT short-circuit on an already stored value here: the comments above
# explain that the superclass call has essential side-effects, and this
# overridden method isn't only called by NutchTextDumpPlugin itself.
sub get_base_OID {
    my ($self, $doc_obj) = @_;

    my $base_oid = $self->SUPER::get_base_OID($doc_obj);
    $self->{'dirname_siteID'} = $base_oid; # store for NutchTextDumpPlugin's internal use

    return $base_oid;
}
1;
