#!/usr/bin/perl -w
#
# htm2thm: convert an HTML document to ThML. 
#
# So for this has mainly been tested on HTML documents that
# came from Word 2000 via de-ms.
# 
# The program reads teh bookID.meta file for the ThML.head section.
# It converts word named styles to appropriate ThML elements. E.g.
# word-style footnotes and endnotes are converted to <note> elements.
# It also adds insertIndex elements as needed.
#
# 2000-03-08, v. 0.31, improve the legality of the resulting ThML. Now,
#    at least one input file generates valid ThML output. However,
#    those align=center attributes are left in to work around a netscape
#    formatting bug, even though they aren't legal ThML. See the line
#    of this program containing "align=".
#    Also made it convert characters styles Citation, Code, Comment,
#    Default, Name, Unclear and paragraph styles HR, HR30, Attribution,
#    Term, Definition, Verse, Verse2, Verse3, Verse4 to appropriate
#    ThML elements.
# 2000-03-01, v. 0.3
# 1999-12-31, v. 0.2, Harry Plantinga
#
use strict;
use ThMLutil;

my $val;
my $oldref;
my $ref="";

my $input = "";
while (<>) { 
  $input .= $_; 
}
print STDERR "\nhtm2thm: " . length($input) . " bytes read\n";

my $thmlhead = "";
$input =~ m|<bookID>(.*?)</bookID>|;
my $bookID = $1;
$bookID ||= $ARGV;
$bookID =~ s/\.\w+//;

open META, "<$bookID.meta" or die $!;
while (<META>) {
   $thmlhead .= $_ ;
}
$thmlhead = "<ThML>\n" . $thmlhead;

my $stylesheet="";
$stylesheet = $1 if $input =~ s|(<style.*?</style>)||is;
#print STDERR "Found a stylesheet.\n" if $1;

$input =~ s|</?html[^>]*>||gis;		# delete <html> and </html> tags
$input =~ s|</?body[^>]*>||gis;		# delete <body> and </body> tags
$input =~ s|<head[^>]*>(.*?)</head>||is;# delete <head> but keep contents
my $headstuff = $1 || "";
$headstuff =~ s|<title>.*?</title>\s*||si;
$thmlhead =~ s/(<\/ThML.head>)/\n$stylesheet\n$headstuff\n$1/;


my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) =
    localtime;
my $date = "" . $year + 1900 . "-$mon-$mday";

my ($index_start, $subject_idx, $scripRef_idx, $scripCom_idx, $cite_idx);
my ($name_idx, $greek_idx, $hebrew_idx, $latin_idx, $german_idx);
my ($french_idx, $page_idx, $index_end);


#
#fix up the document here
#
$_ = $input;

print STDERR "Processing notes: ";
my (%notes, $fnid, $fn, $notelist);
s/(<div class="(end|foot)note-list".*<\/div>\s*<\/div>)//si;
$notelist = $1 || "";
$notelist =~ s|<p class="MsoFootnoteText">|<p class="footnote">|gs;
$notelist =~ s|<p class="MsoEndnoteText">|<p class="endnote">|gs;

while ($notelist =~ s/\s*<div class="(end|foot)note" id="([^"]*)"[^>]*>\s*(.*?)\s*<\/div>//si)
{
   $fnid = $2;
   $fn = $3;
   $fn =~ s/<a class="(end|foot).*?<\/a>//s;
   $notes{$fnid} = $fn;
#  print STDERR "found note $fnid: $fn\n";
}

#
#now put the notes back into the doc as <note> elements
#
s/<a class="(end|foot)note" href="\#_(.*?)".*?<\/a>/&note($1,$2)/gsie;
print STDERR "last footnote $ref\n";


#
# general cleanup
#
s|^\s*<div\s+class="Section1">||gsi;
s|</div>\s*||gsi;
s|(<[^>]*=)([^"'][^ >\n\r]*)|$1"$2"|gsi;   #add quotes around attributes
s|(<[^>]*=)([^"'][^ >\n\r]*)|$1"$2"|gsi;   #add quotes around more attributes
#s!\s*align="?(center|left|right)"?!!gsi;
# these align attributes should be deleted, but are needed to work around
# a netscape formatting bug.
print STDERR "Missing or single quote -- $1\n" if m/(<[^>]*=[^"]\w*)/;

print STDERR "Cleaning up: a";

#
# Process character and paragraph styles to convert
# template styles to appropriate XML.
#
s|<span\s+class="Comment"><span[^>]*>[^<]*</span></span>||gs;
s|<span\s+class="Code"><span[^>]*>([^<]*)</span></span>|$1|gse;
s|<span\s+class="Default">(.*?)</span>|$1|gis;
s|(<p\s+class="HeaderInfo.*?</p>)|&unescape(&detag($1))|gsie;
s|<p class="HR">.*?</p>|<hr />|gis;
s|<p class="HR30">.*?</p>|<hr class="HR30" />|gis;
s|<span\s+class="Name">(<span.*?</span>)</span>|<name>$1</name>|gis; 
s|<span\s+class="Name">(.*?)</span>|<name>$1</name>|gis; 

print STDERR "b";

#
# change classes (from word stylesheet names) to corresp. elements
#
s|<p class="Attribution">(.*?)</p>|<attr>$1</attr>|gis;
s|<p class="Term">(.*?)</p>|<term>$1</term>|gis;
s|<p class="Definition">(.*?)</p>|<def>$1</def>|gis;
s|<p class="Verse">(.*?)</p>|<l class="t1">$1</l>|gis;
s|<p class="Verse2">(.*?)</p>|<l class="t2">$1</l>|gis;
s|<p class="Verse3">(.*?)</p>|<l class="t3">$1</l>|gis;
s|<p class="Verse4">(.*?)</p>|<l class="t4">$1</l>|gis;
s|<span\s+class="Citation">(<span[^>]*>.*?</span>)</span>|<cite>$1</cite>|gis;
s|<span\s+class="Citation">(.*?)</span>|<cite>$1</cite>|gis;
s|<span\s+class="Unclear">(<span[^>]*>.*?</span>)</span>|<unclear>$1</unclear>|gis;
s|<span\s+class="Unclear">(.*?)</span>|<unclear>$1</unclear>|gis;
s|<span\s+class="XML"><span[^>]*>([^<]*?)</span></span>|&unescape($1)|gsie;
s|<span\s+class="XML">(.*?)</span>|&unescape($1)|gsie;

# fix stylesheet entries
s|span.Citation|cite|gsi;
s|span.Name|name|gsi;
s|span.Unclear|unclear|gsi;
s|p.HR30.*?\{.*?\}|.HR30\t{width: 30%}|gsi;

print STDERR "c";

#
# remove paragraphs around some tags; move end/start tags inside divs
#
s/<p[^>]*>(\s*(<\/?(pb|div|added|deleted|scripContext|insertIndex|glossary|verse)[^>]*>)+)\s*<\/p>/$1/gsi;
s/(<pb[^>]*>)\s*(<(added|deleted)[^>]*>)/$2\n$1/gsi;
s|(</div\d>)\s*(</p>)|$2\n$1|gsi;
s|(</div\d>)\s*(</p>)|$2\n$1|gsi;
s|<div [^>]*>||gsi;

#delete blank lines that occur after the end of a division
s|(</div\d>)\s*<p [^>]*>\s*&nbsp;\s*</p>\s*|$1\n|gsi;	
#move the stuff after the end of a division up into the div
s!((</div\d[^>]*>\s*)+)(.*?)(</?(div|added|deleted))!$3$1$4!gsi; 

#remove span styles surrounding unstyled stuff, like <pb>
s/<span[^>]*>(\s*(<\/?(pb|div|added|deleted|scripContext|insertIndex|glossary|verse|name|date|cite|unclear)[^>]*>)+)\s*<\/span>/$1/gsi;
s/<span[^>]*>(\s*(<\/?(pb|div|added|deleted|scripContext|insertIndex|glossary|verse|name|date|cite|unclear)[^>]*>)+)\s*<\/span>/$1/gsi;

# delete some useless spans. Assume not nested more than 4 deep.
s!<span\s*>[^<]*</span>!!gsi;
s!<span\s*>\s*([^<]*(</?(name|date|cite|unclear|i)[^>]*>[^<]*)+)\s*</span>!$1!gsi;
#s!<span\s*>\s*([^<]*<(name|date|cite|unclear|i)[^>]*>[^<]*</\2>[^<]*)\s*</span>!$1!gsi;

print STDERR "d\n";


my $bodpos = index($_, "<ThML.body", 0);
my $head = "";
my $body = $_;
if ($bodpos >= 0) {
   $head = substr($_, 0, $bodpos-1);
   $body = substr($_, $bodpos);
   print STDERR "\nLength:" . length($_) . " head:" . length($head) .
	        " body:" . length($body) . "\n";
   }

# greek, hebrew fonts: add lang attribute
$body =~ s|(<span\s+)([^>]*Sil Galatia)|$1lang="EL" $2|gsi;
$body =~ s|(<span\s+)([^>]*Sil Ezra)|$1lang="HE" $2|gsi;


# add indexes as needed.

if ( $body =~ m/<(index|scripRef|scripCom|cite|name|pb|foreign)/) {
  print STDERR "Making indexes.\n";
  &make_indexes;
  $body .= $index_start;
  $body .= $subject_idx if $body =~ m/<index/;
  $body .= $scripRef_idx if $body =~ m/<scripRef/;
  $body .= $scripCom_idx if $body =~ m/<scripCom/;
  $body .= $cite_idx if $body =~ m/<cite/;
  $body .= $name_idx if $body =~ m/<name/;
  $body .= $greek_idx if $body =~ m/lang="EL/;
  $body .= $hebrew_idx if $body =~ m/lang="HE/;
  $body .= $latin_idx if $body =~ m/lang="LA/;
  $body .= $german_idx if $body =~ m/lang="DE/;
  $body .= $french_idx if $body =~ m/lang="FR/;
  $body .= $page_idx if $body =~ m/<pb/;
  $body .= $index_end;
}


#
# sanity check on head
#
$thmlhead =~ s/^\s*//s;
$thmlhead = "<!DOCTYPE ThML PUBLIC 
  \"-//CCEL/DTD Theological Markup Language//EN\" \"dtd/ThML10.dtd\">
  " . $thmlhead unless $thmlhead =~ m/DOCTYPE/;
$thmlhead = "<?xml version=\"1.0\"?>\n".$thmlhead 
	unless $thmlhead =~ m/<?xml ver/;

#
# process body a bit
#
#$body =~ s|(ThML.body>).*?(<div\d)|$1$2|si;
$body =~ s|.*?(<div\d)|$1|si;
$body =~ s|<span[^>]>\s*</span>||gsi;
$body =~ s|<span\s*>([^<]*)</span>|$1|gsi;
$body =~ s|<p\s+[^>]>\s*</p>||gsi;

# move </divn><divn> tags before preceeding <p>, <pb>, <p><pb>, etc.
$body =~ s=((<(p|pb)\s[^>]*>\s*)+)((</?div\d[^>]*>\s*)+)=$4$1=gis;

# convert lang=".." attributes into full-fledged <foreign> elements
$body =~ s|(<(\w+)\s+[^>]*(lang="..").*?</\2>)|<foreign $3>$1</foreign>|gsi;


#
#now print it out as a "here document"
#

print <<"!!!";
$thmlhead
<ThML.body>
<!--==========original HTML document===============-->
$body

</ThML.body>
</ThML>
!!!


sub note
{
  $oldref = $ref;
  my $place=shift;
  $ref = shift;
# print STDERR $place."note $ref\n";
  my $oldnum = ($oldref =~ m/\d*/);
  my $newnum = ($ref =~ m/\d*/);
  warn "\nHey -- note before $ref missing\n" unless ($newnum == $oldnum++);
  warn "\nHey -- note $ref doesn't exist!\n" unless $notes{$ref};
  return "<note place=\"$place\" id=\"$ref\">$notes{$ref}</note>";
}

sub make_indexes
{
  $index_start = "\n<div1 title=\"Indexes\">\n<h1>Indexes</h1>\n";
  $index_end = "</div1>\n";

  $subject_idx = "<div2 title=\"Subject Index\">\n<h2>Subject Index</h2>\n";
  $subject_idx .= "<insertIndex type=\"subject\"/>\n</div2>\n";

  $scripRef_idx = "<div2 title=\"Index of Scripture References\">\n";
  $scripRef_idx .= "<h2>Index of Scripture References</h2>\n";
  $scripRef_idx .= "<insertIndex type=\"scripRef\"/>\n</div2>\n";

  $scripCom_idx = "<div2 title=\"Index of Scripture Commentary\">\n";
  $scripCom_idx .= "<h2>Index of Scripture Commentary</h2>\n";
  $scripCom_idx .= "<insertIndex type=\"scripCom\"/>\n</div2>\n";

  $cite_idx = "<div2 title=\"Index of Citations\">\n";
  $cite_idx .= "<h2>Index of Citations</h2>\n";
  $cite_idx .= "<insertIndex type=\"cite\"/>\n</div2>\n";

  $name_idx = "<div2 title=\"Index of Names\">\n";
  $name_idx .= "<h2>Index of Names</h2>\n";
  $name_idx .= "<insertIndex type=\"name\"/>\n</div2>\n";

  $greek_idx = "<div2 title=\"Greek Words and Phrases\">\n";
  $greek_idx .= "<h2>Index of Greek Words and Phrases</h2>\n";
  $greek_idx .= "<insertIndex type=\"foreign\" lang=\"EL\"/>\n</div2>\n";

  $hebrew_idx = "<div2 title=\"Hebrew Words and Phrases\">\n";
  $hebrew_idx .= "<h2>Index of Hebrew Words and Phrases</h2>\n";
  $hebrew_idx .= "<insertIndex type=\"foreign\" lang=\"HE\"/>\n</div2>\n";

  $latin_idx = "<div2 title=\"Latin Words and Phrases\">\n";
  $latin_idx .= "<h2>Index of Latin Words and Phrases</h2>\n";
  $latin_idx .= "<insertIndex type=\"foreign\" lang=\"LA\"/>\n</div2>\n";

  $german_idx = "<div2 title=\"German Words and Phrases\">\n";
  $german_idx .= "<h2>Index of German Words and Phrases</h2>\n";
  $german_idx .= "<insertIndex type=\"foreign\" lang=\"DE\"/>\n</div2>\n";

  $french_idx = "<div2 title=\"French Words and Phrases\">\n";
  $french_idx .= "<h2>Index of French Words and Phrases</h2>\n";
  $french_idx .= "<insertIndex type=\"foreign\" lang=\"FR\"/>\n</div2>\n";

  $page_idx = "<div2 title=\"Index of Pages of the Print Edition\">\n";
  $page_idx .= "<h2>Index of Pages of the Print Edition</h2>\n";
  $page_idx .= "<insertIndex type=\"pb\"/>\n</div2>\n";
}
