#!/usr/local/bin/perl -w
# 
# thm2htm,  Harry Plantinga.  This program may be copied
# under the terms of the Artistic License.
#
# This script is a first attempt at ThML to HTML conversion.
#   (Probably this will be done eventually with an XSL stylesheet.)
#
# v0.24, 99-01-30  Fixed various bugs to make it work better with Voyager
#   version of DTD
# v0.23, 99-01-12  Fixed bug that prevented scriprefs in footnotes from
#   being linked to bible gateway.
# v0.23, 99-01-02  Modified to work with ThML0.99 and DC header
# v0.22, 12/7/98.  Modified to work with new, ccel-style URLs
# v0.21, 12/1/98.  Modified to use the division ID as its name, rather
#   than trying to recompute what the ID should be.  Hopefully this
#   will make it work when div elements have manually-inserted ids.
# v0.2, 11/25/98. This version makes a number of unspecified improvements.
#   The program works reasonably well for one or two files, but it has 
#   not been tested very well yet and needs added features.
#
# Eventual hopes: 
# - Generate a preferences panel to set things such as font size, 
#   scripture translation preference, etc. in cookies. 
# - Table of contents as expandable/collapsable outline.  
# - Optional footnote display along right side. 
# - Navigation panel showing context in separate frame. 
# - Left and right arrow keys page backward and forward. 
# - Generate link elements specifying previous and next documents
# - Generate meta tags based on title, author, subject, etc.
# - Etc. etc. etc.
#
use strict;
use ThMLutil;

# File-level state shared by the subroutines below.
my ($bookID, $author, $authorID, $publisherID, $version, $title, $input);
my ($head,$body,$prev,$filename,$notenum,$footnotes,$rights);
my ($DCpublisher, $DCdate, $URL, $description);
my $debug = 0;

# Read every input file (or STDIN) into $input.  join yields "" rather
# than undef when there is no input at all.
$input = join '', <>;

$input =~ s|<deleted.*?</deleted>||gsi;	#delete deleted stuff
$input = dumbquo($input);		#dumb down quotes for HTML
$_ = $input;

getInfo();			#get author, title, etc out of header

# Every output path is built from $bookID, so refuse to continue without
# one instead of writing to paths such as "/styles.css".
die "No <bookID> element found in input; cannot name output directory\n"
  unless defined $bookID && length $bookID;

unless (-e $bookID) {		#make directory for html
  mkdir($bookID, 0774) or die "Couldn't create directory $bookID: $!\n";
}
#`cp $bookID.xml $bookID`;

# fix up references to  other sections of same document
$input =~ s|(href=")#(.*?)(\.p.*?")|$1$2.htm#$2$3|gsi;

# separate out head and body; a failed match would leave $1/$2 holding
# captures from an earlier match, so check it explicitly
$input =~ m|^(.*</ThML.head\s*>).*?(<ThML.body\s*>.*)|si
  or die "Input has no </ThML.head> followed by <ThML.body>\n";
($head, $body) = ($1, $2);

$body =~ s/href="(.*?)"/uri2url($1)/gsie;
processHead($head);	#process ThML.head -- make info page
processBody($body);	#process ThML.body -- make web

# this is really only true if there is a division called TitlePage...
# should create a special index.htm and start there.
print "HTML starts at $bookID/About.htm\n";


#-------------------------subroutines----------------------------
#
# This subroutine converts ccel-style URIs to URLs that will work
# with the multiple htm files generated by this program.
sub uri2url
{
  # Takes the value of one href attribute and returns a complete
  # href="..." attribute.  A target of the form
  #   [/ccel/<authorID>/<bookID>.htm]|<section>[.p<n>...]
  # is rewritten to point at the per-section file written by this
  # program; any target without a "|" is returned unchanged.
  my $url=shift;

  # The IDs are literal text, not patterns: quote them so that regex
  # metacharacters in an ID cannot change (or break) the match.
  my $aid = defined $authorID ? $authorID : "";
  my $bid = defined $bookID   ? $bookID   : "";

  if ($url =~ s@(?:/ccel/\Q$aid\E/\Q$bid\E\.htm)?\|@@) {
    if ($url =~ m/\.p/) {
      # "sect.p3" --> "sect.htm#sect.p3": the part before the first ".p"
      # names the file, the whole ID is the anchor inside it.
      $url =~ s/((.*?)\.p.*)/$2.htm#$1/;
    } else {
      $url .= ".htm";
    }
  }

  return "href=\"$url\"";
}


# this subroutine gets some important info from the header: 
# title, author, bookID, authorID
#
sub getInfo
{
  # Scans the document text in $_ and fills in the file-level variables
  # $title, $author, $rights, $DCpublisher, $DCdate, $URL, $bookID,
  # $authorID, $publisherID, $version and $description, then prints a
  # progress line.
  #
  # Each capture is taken only when its own match succeeded: after a
  # failed match $1 still holds the capture of the previous successful
  # one, so an absent element must not read $1 at all.
  $title       = m|<DC.Title.*?>(.*?)</DC.Title\s*>|is             ? $1 : "";
  $author      = m|<DC.Creator.*?>(.*?)</DC.Creator\s*>|is         ? $1 : "";
  $rights      = m|<DC.Rights.*?>(.*?)</DC.Rights\s*>|is           ? $1 : "";
  $DCpublisher = m|<DC.Publisher.*?>(.*?)</DC.Publisher\s*>|is     ? $1 : "";
  $DCdate      = m|<DC.Date.*?>(.*?)</DC.Date\s*>|is               ? $1 : "";
  $URL         = m|<DC.Identifier.*?>(http.*?)</DC.Identifier\s*>|is ? $1 : "";

  # These are left untouched (possibly undefined) when the element is absent.
  $bookID=$1        if m|<bookID\s*>(.*?)</bookID\s*>|is;
  $authorID=$1      if m|<authorID\s*>(.*?)</authorID\s*>|is;
  $publisherID=$1   if m|<publisherID\s*>(.*?)</publisherID\s*>|is;
  $version=$1       if m|<version\s*>(.*?)</version\s*>|is;

  # <description> wins; <DC.description> is only a fallback.
  $description=$1   if m|<description\s*>(.*?)</description\s*>|is;
  $description||=$1 if m|<DC.description\s*>(.*?)</DC.description\s*>|is;

  print "Processing $title by $author\n";
}
  

#
#  Make an info page out of the ThML.head information
#
sub processHead
{
  # Takes the ThML.head text and writes two files into the $bookID
  # directory: styles.css (the contents of the first <style> element)
  # and About.htm (the information page).
  #
  # retag() comes from ThMLutil and is called without a string argument,
  # so it presumably edits $_ -- which is why the text is kept in $_.
  # local restores the caller's $_ on return.
  local $_ = shift;

  #output the stylesheet which is common for all sections of this doc
  my $cssfile = "$bookID/styles.css";
  open my $css, '>', $cssfile or die "Couldn't write $cssfile: $!\n";
  my $styles="";
  $styles = $1 if s|<style.*?>(.*?)</style\s*>||is;
  s|<link.*?/>||gis;
  print {$css} $styles;
  close $css or die "Couldn't close $cssfile: $!\n";

  my $front = head("About $title");
  $front .= "<h1 class=\"title\">About <i>$title</i></h1>\n";
  $front .= "<h2 class=\"subhead\">by $author</h2>\n";
  $front .= "<h3 class=\"subhead\"><i>CCEL Edition v$version</i></h3><hr>\n";
  $front .= "<p class=\"First\">$description</p>\n" if $description;
  $front .= "<h3 class=\"subhead\"><a class=\"TOC\" href=\"TOC.htm\">Table of Contents";
  $front .= "</a></h3><hr>\n";
  my $back = "</body>\n</html>\n";

  s|<(.*?)></\1\s*>\s*||gm;  #delete empty <x></x> tags
  retag("!DOCTYPE",	"_detag");
  retag("ThML",	"_detag");
  retag("ThML.head",	"_detag");
  retag("meta",	"_detag");

  #
  # process each of generalInfo, printSourceInfo, elecEdInfo separately
  #
  s@<(generalInfo|printSourceInfo|electronicEdInfo)\s*>(.*?)</\1\s*>@headsect($1,$2)@egs;

  my $infofile = "$bookID/About.htm";
  open my $info, '>', $infofile or die "Couldn't write $infofile: $!\n";
  print {$info} $front . $_ . $back;
  close $info or die "Couldn't close $infofile: $!\n";
}


# 
#  Put the Table of Contents and each <divn>...</divn> into a 
#  separate file.
#
sub processBody
{
  # Takes the ThML.body text.  Writes TOC.htm plus one .htm file per
  # <divN> element into the $bookID directory, then appends to the TOC a
  # list of the other (non-.htm) files found in that directory.
  #
  # retag() comes from ThMLutil and is called without a string argument,
  # so it presumably edits $_ -- which is why the text is kept in $_.
  local $_ = shift;
  $filename = "_none";

  retag('attr',	      'p class="Attribution"', "p");
  retag('argument',   'p class="Argument"', "p");
  retag('meter',      'p class="meter"', "p");
  retag('sectionInfo','p class="sectionInfo"', 'p');
  retag('name',       'span class="Name"', 'span');
  retag('date',       'span class="Date"', 'span');
  retag('unclear',    'span class="unclear"', 'span');
  retag('l',          'p', 'p');
  retag('verse',      'div class="Verse"', 'div');

  my $tocfile = "$bookID/TOC.htm";
  open my $toc, '>', $tocfile or die "Couldn't write $tocfile: $!\n";
  print {$toc} head("$title - TOC");
  print {$toc} "<h1 class=\"title\">$title</h1>\n";
  print {$toc} "<p class=\"by\">by</p>\n";
  print {$toc} "<h2 class=\"subhead\">$author</h2>\n";
  print {$toc} "<hr><h1 class=\"title\">Table of Contents</h1>\n";
  print {$toc} "<p class=\"TOC1\"><a class=\"TOC\" href=\"About.htm\">";
  print {$toc}   "<i>About This Edition</i></a></p>\n";

  #
  # For each divn tag we add a TOC entry, then put the contents
  # into a separate file.
  #
  my ($level, $rest, $content);
  while (m|<div[1-7]|)		# while there is a remaining <div
  {
    # Each pass must consume one opening tag from $_.  If the tag is
    # malformed the substitution fails, $_ is unchanged and the loop
    # would never end -- so stop with an error instead.
    if (m|<div[1-7].*?<div[1-7]|s) 	#if there are two divns left
    {
      s|.*?<div([1-7])(.*?)>(.*?)(<div[1-7])|$4|s
        or die "Malformed <divN> tag in body\n";
      ($level, $rest, $content) = ($1, $2, $3);
    }
    else
    {
      s|.*?<div([1-7])(.*?)>||s
        or die "Malformed <divN> tag in body\n";
      ($level, $rest) = ($1, $2);
      $content = $_;		# last division: everything that is left
    }

    # getName() comes from ThMLutil; it is used here as returning
    # (division id, division title, output file name).
    my ($divID, $divtitle, $next);
    $prev = $filename;				#remember last division
    ($divID, $divtitle, $filename) = getName($rest);

    # Peek at the following division (if any) for the "next" link.
    $next = "_none";
    (undef, undef, $next) = getName($1) if m|.*?<div[1-7](.*?)>|s;

    print {$toc} "<p class=\"TOC$level\"><a class=\"TOC\" href=\"$filename\">";
    print {$toc} "$divtitle</a></p>\n";

    # 3-arg open: the file name is derived from document content and
    # must never be interpreted as an open mode.
    my $outfile = "$bookID/$filename";
    open my $out, '>', $outfile or die "Couldn't write $outfile: $!\n";
    print {$out} processDiv($content,$divID,$prev,$next);
    close $out or die "Couldn't close $outfile: $!\n";
  }

  print {$toc} "\n<hr>\n<b>Also Available:</b>\n";
  print {$toc} "<ul><li><a href=\"About.htm\">About <i>$title</i></a>\n";
  opendir my $dir, $bookID or die "Couldn't open dir $bookID!\n";
  my @files = sort readdir $dir;	# sorted so the listing is stable
  closedir $dir;

  # add a link to non-.htm files in the same directory
  # (e.g. xml, thm, txt versions of the file)
  foreach my $f (@files) {
    next if $f =~ m/\.htm$/;
    next if $f eq '.' || $f eq '..';
    next if $f =~ m/styles\.css$/;
    next if $f =~ m/dbimport\.sql$/;

    # -s gives a false, non-numeric value for an empty file; use 0 then.
    my $size = (-s "$bookID/$f") || 0;
    if ($size < 1024) {
      $size .= " bytes";
    } else {
      $size >>= 10;	#divide by 1024
      $size .= " KB";
    }
    print {$toc} "<li><a href=\"$f\">$f</a> [$size]\n";
  }

  print {$toc} "</ul>\n</body></html>\n";
  close $toc or die "Couldn't close $tocfile: $!\n";
}



sub processDiv
{
  # Builds the complete HTML page for one division.
  #   $div   text of the division
  #   $sect  division id, appended to the book title in <title>
  #   $prev  file name for the BACK link, or "_none"
  #   $next  file name for the NEXT link, or "_none"
  # Returns the page as a string.  Resets the file-level footnote
  # counter and footnote text, which note() fills in for this page.
  my ($div, $sect, $prev, $next) = @_;
  $notenum = 1;
  $footnotes = "";

  my $front = head("$title $sect");	#construct HTML head
  my $back = "</body></html>\n";
  my $nav = navbar($prev, $next);

  # scripture references become links; notes are collected at the
  # bottom of the page and replaced by numbered references
  $div =~ s|<scripRef(\s+[^>]*parsed="(.*?)"[^>]*)>(.*?)</scripRef>|bglink($1,$2,$3)|gsie;
  $div =~ s|(<note.*?>.*?</note>)|note($1,$notenum)|gsie;

  $div =~ s|\&line;|<br>|g;		#change &line; to <br>

  # add space to blank paragraphs.  The opening tag is matched as
  # <p ...> only, so that "<p>text<br></p>" or "...</i></p>" (a tag
  # that merely ends a non-empty paragraph) is left alone.
  $div =~ s|(<p\b[^>]*>)(</p>)|$1&nbsp;$2|gs;

  #now for something really nasty: lists were generated as
  #  <li><ul>: the <li> is required in valid HTML4.
  #But it looks terrible, with blank lines where they're not wanted.
  #This hack deletes that extra <li>, resulting in invalid (but better?)
  #HTML.

  $div =~ s/<li>(<ul class="Index)/$1/g;
  return "$front$nav\n$div\n$footnotes$nav$back";
}


sub note
{
  # Takes one complete <note>...</note> element.  Appends its contents,
  # prefixed with a numbered back-link, to the file-level $footnotes,
  # increments the file-level $notenum, and returns the numbered
  # forward-link that replaces the note in the running text.
  # (Callers also pass the note number as a second argument; the
  # file-level counter is what is actually used.)
  my $note=shift;

  # Fix the number once, up front, so that both links carry the same
  # number without relying on when $notenum++ is evaluated.
  my $num = $notenum;
  print "Processing footnote $note -- number $num\n" if $debug;
  $footnotes = "\n<hr class=\"Note\">\n" if $footnotes eq "";

  $note =~ s|</?note[^>]*>||g;
  my $bref="<a class=\"Note\" name=\"_fnf$num\" " .
     "href=\"#_fnb$num\"><sup class=\"Note\">$num</sup></a>";

  # Put the back-link just inside the opening <p ...> when there is one
  # (leading whitespace allowed); otherwise put it in front of the text
  # so the footnote is never left without its number and anchor.
  $note = "$bref $note"
    unless $note =~ s|^(\s*<p\b[^>]*>)|$1$bref |;
  $note .= "\n";
  print "After processing: Footnote $note\n" if $debug;
  $footnotes .= $note;

  $notenum++;
  return "<a class=\"Note\" name=\"_fnb$num\" " .
     "href=\"#_fnf$num\"><sup class=\"Note\">$num</sup></a>";
}

# link scripture references to bible gateway
sub bglink
{
  # Turns one <scripRef> element into a link to the bible gateway.
  #   $atts    attribute text of the scripRef tag
  #   $parsed  value of its parsed="..." attribute: ";"-separated
  #            references, each "version|book|fromCh|fromV|toCh|toV"
  #   $text    element content, used as the link text
  # Returns the <a ...>...</a> string.
  my ($atts, $parsed, $text) = @_;

  # Carry an id="..." attribute over to the link.  Read $1 only when the
  # match succeeded; otherwise it holds whatever an earlier match (such
  # as the caller's) captured.
  my $id = $atts =~ m|(\bid="[^"]*")| ? $1 : "";

  my $bg = "";
  my $version = "";
  foreach my $ref (split /;/, $parsed) {
    my ($ver, $book, $fch, $fv, $tch, $tv) = split /\|/, $ref;
    $version = defined $ver ? $ver : "";	# the last reference decides
    $bg .= defined $book ? $book : "";
    $bg .= "+$fch" if $fch;
    $bg .= ":$fv" if $fv;
    $bg .= "-" if $tch;
    $bg .= "$tch:" if $tch && (!defined $fch || $tch ne $fch);
    $bg .= $tv if $tv;
    $bg .= ",";
  }
  $bg =~ s/ //g;
  $bg = "<a href=\"http://bible.gospelcom.net/bible?passage=" . $bg ;
# $version = "Vulgate&language=Latin" if $version eq "VUL";
  $version = "" if $version eq "VUL";	#would like English vulgate...
  $bg .= "&version=$version" if $version;
  $bg .= "\" class=\"scripRef\" $id>$text</a>";
  return $bg;
}

#------------tag-hack subroutines-----------------


#
# Process a head section: generalInfo, printSourceInfo, or electronicEdInfo
# First parameter is tag; second is contents of element
#
sub headsect
{
  # Renders one header section as HTML.
  #   first argument:  element name (generalInfo, printSourceInfo or
  #                    electronicEdInfo)
  #   second argument: contents of that element
  # Returns a heading, the fixed identification lines (generalInfo
  # only), and the contents with each child element passed through
  # headItem / emptyHeadItem.
  my ($section, $contents) = @_;
  print "Processing $section\n" if $debug;

  my %heading_for = (
    generalInfo      => "Book",
    printSourceInfo  => "Print Source",
    electronicEdInfo => "Electronic Edition",
  );
  my $what = exists $heading_for{$section} ? $heading_for{$section} : "";
  my @html = ("<h3>Information on the ", $what, "</h3>\n");

  if ($section eq "generalInfo") {
    push @html,
      "<p class=\"HeadItem\"><b>Copyright</b>: $rights</p>\n",
      "<p class=\"HeadItem\"><b>CCEL Identifier</b>: $publisherID/",
      "$authorID/$bookID/$version</p>\n",
      "<p class=\"HeadItem\"><b>How to Reference This Edition</b>: ",
      "$DCpublisher, $DCdate, v$version, URL $URL</p>\n";
  }

  # paired elements first, then empty <x/> elements
  $contents =~ s|<([^>]*?)>(.*?)</\1>|headItem($1,$2)|gse;
  $contents =~ s|<([^>]*?)/>|emptyHeadItem($1)|gse;

  return join "", @html, $contents, "\n\n";
}


#
#  Process one header element inside generalInfo, etc:
#  Modify element names to make nice titles: 
#    printSourceInfo --> Print Source Info 
#
sub headItem
{
  # Renders one child element of a header section.
  #   first argument:  element name
  #   second argument: element content
  # Returns "" for the identification elements (shown elsewhere as the
  # CCEL Identifier), a Dublin Core table for <DC>, and otherwise a
  # paragraph labelled with a readable form of the element name,
  # e.g. printSourceInfo --> Print Source Info.
  my ($tag, $item) = @_;

  my %suppressed = map { $_ => 1 } qw(bookID authorID version publisherID);
  return "" if $suppressed{$tag};
  return processDC($item) if $tag eq "DC";

  $tag =~ s|([A-Z]{2})([a-z])|$1 $2|g;  #space after acronym
  $tag =~ s|([a-z])([A-Z])|$1 $2|g;	#add in spaces
  $tag = ucfirst($tag);			#make first letter upper case

  #tags have a content model of "Flow" may have their own <p>, etc.
# return "<p class=\"HeadItem\"><b>$tag:</b></p>$item";
  return "<p class=\"HeadItem\"><b>$tag:</b>$item</p>";
}

sub emptyHeadItem
{
  # Takes the inside of an empty <.../> element from a header section.
  # Elements whose text contains "image" are dropped; anything else is
  # passed through unchanged.
  my ($tag) = @_;
  return $tag =~ m/image/ ? "" : "<$tag/>";
}

sub processDC
{
  # Takes the content of a <DC> element and returns it as an HTML table
  # with one row per DC.* element: element name, scheme, content.
  my ($record) = @_;

  for ($record) {
    s|text/xml|text/html|;
    s|\s+sub="(\w+)"|.$1|g;			#for DC subelement
    # elements carrying a scheme first, then those without one
    s|<(DC[A-Za-z._]+)\s+scheme="(.*?)"\s*>(.*?)</DC.*?>|<tr><td>$1<td>$2<td>$3</tr>|gs;
    s|<(DC[A-Za-z._]+)\s*>(.*?)</DC.*?>|<tr><td>$1<td><td>$2</tr>|gs;
  }

  return join "",
    "<h4>Dublin Core Record</h4>\n",
    "<table border=\"2\">\n",
    "<tr><td><b>Element</b><td><b>Scheme</b><td><b>Content</b></tr>\n",
    "$record</table>\n";
}
  


#
#given a title, return head of document, through <body>.
#
sub head
{
  # Takes a page title; returns the opening of an HTML document, from
  # <html> through <body>, linking the shared stylesheet and importing
  # the per-book styles.css.
  my ($title) = @_;

  return join "",
    "<html><head>\n",
    "<title>$title</title>\n",
    '<link rel="stylesheet" type="text/css" href="/css/ThML0993.css"/>',
    "\n<style>\n<!--\n  \@import url(styles.css);\n-->\n</style>\n",
    "\n</head><body>\n\n";
}


#
#build a nav bar
#
sub navbar
{
  # Takes the file names of the previous and next pages and returns a
  # centered paragraph of image links: BACK, UP (to TOC.htm), NEXT.
  # A file name of "_none" leaves that link out.
  my ($prev, $next) = @_;

  my @links;
  push @links, "<a href=\"$prev\"><img src=\"/pix/mroonppv.gif\" "
             . "alt=\"BACK\" border=\"NO\"></a>"
    unless $prev eq "_none";
  push @links, "<a href=\"TOC.htm\"><img src=\"/pix/mroontoc.gif\" "
             . "alt=\"UP\" border=\"NO\"></a>";
  push @links, "<a href=\"$next\"><img src=\"/pix/mroonpnx.gif\" "
             . "alt=\"NEXT\" border=\"NO\"></a>"
    unless $next eq "_none";

  return join "", "<p class=\"Center\">", @links, "</p>\n";
}
