#!/usr/bin/perl

use Storable;
use Data::Dumper;
use Digest::MD5  qw(md5 md5_base64);
use Roman;

# Where the finished index is written, and the queue of local source file names.
my $indexfile = "./aquinasindex.storable";
my @sources;

use lib "./";
# use Aquinas qw(indexfile books);
require "./Aquinas.pl";

# State shared by the whole script:
#   %index - hits per book and chapter
#   %links - URLs per node key
#   %nodes - latin/english text per node key
#   %done  - node keys that have already been parsed
my (%index, %links, %nodes, %done);
my $count = 0;
my ($allBooks, @allBooksList);
my @replacements;

# Flatten every name list in %books into one array.
# %books is not defined in this file (presumably Aquinas.pl provides it - verify);
# each value is used as a reference to an array of book names.
push @allBooksList, map { @$_ } values %books;

print qq!PRE:\n!;
print Dumper %books;

# Characters that the word splitter further down also splits on. A book name
# containing any of them would be broken into several words, so each run of
# them is collapsed into a single underscore.
my $separators = qr/[\s\.\,\:\-\;\(\)\[\]\|]+/;

for my $fullName (keys %books) {
	print "B: $fullName, @{$books{$fullName}}\n";

	# The loop variable aliases the array element, so rewriting it
	# also updates the name stored in %books.
	for my $name (@{$books{$fullName}}) {
		print "$name -> $fullName\n";

		if ($name =~ $separators) {
			# @replacements holds (original, underscored) pairs that are
			# later applied to the text of every chapter.
			push @replacements, $name;
			$name =~ s/$separators/_/g;
			push @replacements, $name;
			}

		# Build the reverseBooks hash: name as it appears in the text -> full book name
		$reverseBooks{$name} = $fullName;
		}
	}
# Do the same with the english book-names, but we don't need the reverseBooks index, since latin is "master"
# foreach my $b (keys(%englishbooks)) {
# 	print "B: $b, @{$englishbooks{$b}}\n";
# 	foreach $bk(@{$englishbooks{$b}}) {
# 		print "$bk -> $b\n";
# 		
# 		# Some books contain a space. We replace that space with an underscore
# 		# before we store the book in reverseBooks.
# 		# We also push that book to a list of text replacements to be done on each chapter
# 		
# 		# This might possibly be done more effectively using a queue, but this will also work
# 				
# 		if ($bk =~ m/[\s\.\,\:\-\;\(\)\[\]\|]+/) { 
# 			push (@replacements, $bk); 
# 			$bk =~ s/[\s\.\,\:\-\;\(\)\[\]\|]+/_/g; 
# 			push (@replacements, $bk); 
# 			$books{$b}[$idx] = $bk;
# 			
# 			}
# 		$idx ++;
# 		}
# 	}
		

# Dump the book table again so it can be compared with the PRE dump.
print "POST:\n", Dumper(%books);

# All book names joined with "|" (only printed in this file).
$allBooks = join '|', @allBooksList;

# These can also be chapters
%alsoChapters = map { $_ => 1 } ("ultimo", "Ultimo", "ult.");

$allChapters = "[MDCLXVI]+|ultimo";

# Read the file with URLs. The part after the last "/" of each line, with any
# "~" removed, is queued in @sources as the name of a local file.
open(my $urlFh, '<', "./aquinas_urls.txt") or die $!;
while (my $url = <$urlFh>) {
	# Drop the line ending (including a CR from CRLF files) so that it can
	# never become part of the file name.
	$url =~ s/\s+\z//;

	# Test the match itself: $1 is only meaningful after a successful match.
	next unless $url =~ m/.*\/(.*)$/;
	my $file = $1;
	next unless length $file;

	$file =~ s/\~//g;
	push @sources, $file;
	}
close($urlFh);

# Only referenced by a commented-out debugging limit in the main loop.
my $tick = 0;

print "ALL BOOKS: $allBooks\n";
print "ALL CHAPTERS $allChapters\n";
print "ALL REPLACEMENTS: ";
print Dumper @replacements;
# exit;

# With the array, read the local files (see fetch.pl).
# NOTE(review): an earlier comment here said the files are read in reverse
# order, but shift() takes them from the front of @sources, i.e. in the order
# they were queued - confirm which order is wanted.
#
# For every file the content of the "vl_ul" container is extracted and each
# <vl-r> row in it is treated as one node. A node is identified by
# "<section>-<md5 of the row>-<row id>". The loop fills:
#   %links{key}               list of URLs the node was seen in
#   %nodes{key}               { latin => ..., english => ... } for nodes with a hit
#   %index{book}{CHAPTER}     list of hits: { verse, node, start, end, start_e, end_e }
# and counts every hit in $count.

# Turn a verse token found by the look-ahead into a number.
# Roman numerals go through arabic(); anything else is taken as an integer.
sub verseToArabic {
	my ($token) = @_;
	if ($token =~ m/\b([MDCLXVI]+)\b/i) {
		# Some odd roman numbers in the source: normalise before converting
		$token =~ s/IIII/IV/i;
		return arabic($token);
		}
	return int($token);
	}

# Content of the previously processed file, used to skip consecutive duplicates
my $remembered;

while (my $file = shift @sources) { 
	# The section is the part of the file name before the first dot
	my @splits = split(/\./, $file);
	my $section = $splits[0];
	chomp($section);
	print "\n\n OPENING ./source/$file\n\n";

	# A file that cannot be read is reported and skipped instead of being
	# silently treated as empty.
	my $fh;
	if (! open($fh, "<", "./source/$file")) {
		warn "Cannot open ./source/$file: $!\n";
		next;
		}
	my $wholefile = do { local $/; <$fh> };
	close $fh;

	# Only take the capture when the match succeeded; a failed match leaves
	# $1 holding whatever an earlier match captured.
	my $content = "";
	if ($wholefile =~ m/<div id="vl_ul" .*?>(.*?)<\/div>/sm) {
		$content = $1;
		}

	#print "WHOLEFILE: $wholefile\n";
	#print "CONTENT: $content\n";
	# last if ++$tick > 10;

	# Same content as the previous file: nothing new to find
	if ($content eq $remembered) { 
		# print "Duplicate content\n"; 
		next; 
		}
	$remembered = $content;

	while ($content =~ m/<vl-r id="(.*?)">(.*?)<\/vl-r>/gsm) {
		my $node  = $1;
		my $block = $2;
		
		my $hash = md5_base64($block);
		$hash =~ s/\s//g;
		$hash =~ s/^\+//;
		
		# Nodes occur in multiple files.
		# We want to know which files each node exists in, but we don't need
		# to process them more than once.
		# We include a hash of the content in the key, since the node number
		# is reused across various parts of the corpus.
		my $key = "$section-$hash-$node";
		
		push(@{$links{$key}}, "https://aquinas.cc/la/en/~$file");
		if ($done{$key}) { 
			# print "+"; 
			next; 
			}
		$done{$key} = 1; # print ".";
		
		my $book = undef;
		# print "-----\n";
		# print "BLOCK: $block\n";
		
		# Debug switch for the conditional prints below
		my $ofInterest = 0;
		
		# @replacements holds (from, to) pairs. The book name is matched
		# literally (\Q..\E): names may contain ".", "(", "[" or "|", which
		# would otherwise be read as regex syntax.
		for (my $i = 0; $i + 1 < @replacements; $i += 2) {
			my ($from, $to) = @replacements[$i, $i + 1];
			$block =~ s/\b\Q$from\E\b/$to/gi;
			}

		# First column is latin, second column is english. Without two
		# columns both stay empty and the node yields no hits.
		my ($latin, $english) = ("", "");
		if ($block =~ m/<vl-c.*?>(.*?)<\/vl-c>.*?<vl-c.*?>(.*?)<\/vl-c>/sm) {
			($latin, $english) = ($1, $2);
			}
		
		# Simple strip HTML
		$english =~ s|<.+?>||g;
		$latin   =~ s|<.+?>||g;

		# Not all sources have english version, no need to store it then
		if ($latin eq $english) {
			$english = "";
			}

		# We need a "working-copy": matched references are cut out of it
		my $engcopy = $english;

#		my @words = split(/\s+/, $latin);
		my @words = split(/[\s\.\,\:\-\;\(\)\[\]\|]+/, $latin);

		# Number of words a found book stays "active" while we look for its chapter
		my $distance = 0;
	
		# Hold the position of the match (latin, then english)
		# NOTE(review): a book token leaves the loop through "next" before
		# $wc is advanced, and the verse look-ahead shifts words without
		# advancing it either, so $wc is not a plain index into the split
		# result. Left as is; confirm against the code that reads the index.
		my $wc      = 0;
		my $start   = 0;
		my $end     = 0;
		my $wc_e    = 0;
		my $start_e = 0;
		my $end_e   = 0;

		my ($fullname, $chapter, $verse, $match);
		my ($engbook, $engchapter, $engverse);
	
		# defined(): an empty first token (text starting with a separator)
		# or a "0" token must not end the loop for the whole node.
		while (defined(my $word = shift(@words))) {

			# Our split will sometimes make words that start with a non-word character, like "(Gen)"
			$word =~ s/^\W(\w+)/$1/;
		
			print "$word - " if $ofInterest;

			# See if we have a book
	#		if (indexOf($word, @allBooksList)) {
			if ($reverseBooks{$word}) {
				$book = $word;
				$fullname = $reverseBooks{$book};
				
				#print "https://aquinas.cc/la/en/~$file\n";
#				print "Found BOOK: $book\n"  if $ofInterest;
				$distance = 5;
				$start = $wc; 

				# A few books do not have chapters, just pretend it's chapter I
#				if (indexOf($fullname, @dontHaveChapters)) {
				if ($fullname eq "Philemon" || $fullname eq "2 John" || $fullname eq "3 John" || $fullname eq "Jude") {
					print "Assuming chapter I ($word)\n";
					$word = "I";
					}
				else { next; }
				}

			# See if we have a chapter
#			if ($word =~ m/\b[MDCLXVI0]+\b/ || $word =~ m/\b[1234567890]+\b/|| $alsoChapters{$word}) {
			if ($word =~ m/\b[MDCLXVI]+\b/i || $word =~ m/\b[1234567890]+\b/|| $alsoChapters{$word}) {
				$chapter = $word;
				print "Found CHAPTER: $word\n"  if $ofInterest;

				# See if we have both chapter and verse (standard form)
				# NOTE(review): the split above already breaks on ":", so a
				# token probably never has this form - verify.
				if ($chapter =~ m/(\d+)\:(\d+)/) {
					$match = $1;
					$verse = $2;
					$chapter = uc(roman($match)); # convert chapter to roman, keep verse as arabic
					# print "Found verse from $word\n";
					}

				elsif ($chapter =~ m/(\d+)/) {
					$match = $1;
					$chapter = uc(roman($match));
					# print "Arabic $match ($word) -> $chapter in book $book\n";
					}

		
				if ($book && $chapter) {

					# marks the end of current match
					$end = $wc;
					
					# Did we find a verse? If not, try to look ahead. If we find a number (arabic or roman)
					# as next or next-again word, we assume it's the verse. 
					
					if (! $verse) {
					
						if ($words[0] =~ m/\b([MDCLXVI]+)\b/i || $words[0] =~ m/\b([1234567890]+)\b/) {
							$match = $1;
							$verse = verseToArabic($match);
							shift(@words); # So as to not pollute the next match
							$end = $wc + 1;
						
							print "Found verse from looking 1 ahead: $match / $verse\n";

							}
						elsif ($words[1] =~ m/\b([MDCLXVI]+)\b/i || $words[1] =~ m/\b([1234567890]+)\b/i) {
							$match = $1;
							$verse = verseToArabic($match);
							shift(@words); 
							shift(@words); # So as to not pollute the next match
							$end = $wc + 2;

							print "Found verse from looking 2 ahead: $match / $verse\n";

							}
						
						}


					# Keep the english version in sync
					# See first if we can find book-chapter-verse
					# NOTE(review): the english names are interpolated as regex
					# source and an empty list gives an empty alternation -
					# verify the contents of %englishbooks.
					my $combinedenglish = join("|", @{$englishbooks{$fullname}}); # Creates a string with english version of "book" separated by "|" for use in regexp.
					print "Looking for $combinedenglish derived from $book\n";

					if ($engcopy =~ s/^(.*?)\b($combinedenglish)[\s\.\,\:\-\;\(\)\[\]\|]*(\d+)[\s\:\.\,\-\;\(\)\[\]\|](\d+)/ /) {
						my $cut = $1;
						$engbook = $2;
						$engchapter = $3;
						$engverse   = $4;
						
						my @cuts = split(/\s+/, $cut);
						my $length = scalar(@cuts);
						$wc_e += $length; # Add numbers of words in "$cut" to word-count-english
						print "Counting: wc_e is now $wc_e . Cut ($cut) is $length\n";
						$start_e = $wc_e;
						$end_e   = $wc_e + 3;
						}
					
					# No verse, see if we have book and chapter
					elsif ($engcopy =~ s/^(.*?)\b($combinedenglish)[\s\.\,\:\-\;\(\)\[\]\|]*(\d+)/ /) {
						my $cut = $1;
						$engbook = $2;
						$engchapter = $3;
						
						my @cuts = split(/\s+/, $cut);
						my $length = scalar(@cuts);
						$wc_e += $length; # Add numbers of words in "$cut" to word-count-english
						print "Counting: wc_e is now $wc_e . Cut ($cut) is $length\n";
						$start_e = $wc_e;
						$end_e   = $wc_e + 2;
						}
					
					$count ++;
					$distance = 5;
				
					# Store the "node" if we don't have it already
					if (! $nodes{$key}) {
						$nodes{$key} = { latin => $latin, english => $english };
						}
				
					my %hit;
				
					$hit{node} = $key;
					$hit{start} = $start;
					$hit{end}	= $end;
					$hit{start_e} = $start_e;
					$hit{end_e}	= $end_e;
					$hit{verse} = $engverse || $verse || 0; # English version have more occurrences of verse

					push(@{$index{$fullname}{uc($chapter)}}, \%hit);

					print "BOOK AND CHAPTER: $fullname/$chapter/$verse/$engverse - $start / $end / $start_e / $end_e \n";
					# print "$key / $file\n";
					#print "$latin\n";
					# print "-------\n";
				
					# Reset book and chapter
					$book       = undef; 
					$chapter    = undef;
					$verse      = undef;
					$engbook    = undef;
					$engchapter = undef;
					$engverse   = undef;
					
					}
				}
		
			# A book is only remembered for a few words; after that we forget it
			if ($distance > 0) { $distance --; } else { $book = undef; $chapter = undef; $verse = undef; }
				
			$wc ++;
			
			}

		# print "EN: $english\nLA: $latin\n-----\n\n";
		}
	}


# Persist the three tables as one array: [ \%index, \%links, \%nodes ].
# store() reports I/O problems through a false return value, so check it
# instead of finishing silently without an index file.
my @all = ( \%index, \%links, \%nodes );
store(\@all, $indexfile) or die "Could not write $indexfile: $!\n";

#store \%index, $indexfile;
 
# indexOf($word, @array)
# Tells whether $word is string-equal to an element of @array.
# Returns the position of the first match, except that position 0 is
# reported as 1 so that "found" is always a true value; returns 0 when
# there is no match.
# The arguments are copied into lexicals so that the package variables
# $word and @array used elsewhere in the script are left untouched.
sub indexOf { 
	my ($word, @array) = @_; 
	foreach my $i (0..$#array) { 
		if ($array[$i] eq $word) { 
			return $i || 1; }
		} 
	return 0; 
	}

# Closing summary: the joined book names and the number of references indexed.
printf "ALL BOOKS: %s\nHits: %s\n", $allBooks, $count;