#!/usr/bin/perl -w
#
# index v0.5, Harry Plantinga, Nov. 8, 1998. This program may be 
# distributed and used under the terms of the Artistic License.
#
# index: add indexes for <insertIndex /> elements.
#
# v0.52, Dec. 3, 1998. Modified to generate new, generalized URLs --
#          /ccel/authorID/bookID.htm|ID -- which may need conversion to html
#          or a server script or client mods to interpret correctly (e.g. 
#          first search local hard drive, then CD-ROM, then get over network)
# v0.51, Nov. 20, 1998.  Changed the way scripture indexes are made --
#    use <P>s instead of <UL><LI>
# v0.52, Mar. 2, 2000. Shorten long index entries; print number of each
#    type of index entry; add index of pages of the print edition
# 
use strict;
my (%bookID, %idfile);	#book name or alias => bookID; id => filename 
my @bookName;		#gives official book name for bookID
my @chapters;		#gives number of chapters for bookID

my $input = "";
while (<>) { 
  $input .= $_; 
}
print STDERR "\nindex: " . length($input) . " bytes read\n";

&readbookID;		#read in Bible book names, IDs, aliases, chapters

$_ = $input;

# get default version and book, if any
my ($versionDefault) = m|<scripContext[^>]*version="(.*?)"|s;
my ($bookDefault) = m|<scripContext[^>]*passage="(\w*)|s;
my ($authorID) = m|<authorID\s*>(.*?)</authorID\s*>|s;
my ($bookID) = m|<bookID\s*>(.*?)</bookID\s*>|s;
my ($url) = "/ccel/$authorID/$bookID.htm";

# time to delete <deleted> stuff so it doesn't get indexed
s|<deleted.*?</deleted>||gs;

s|(<insertIndex.*?>)|insertIndex($1)|gsie;
s|</insertIndex\s*>||gs;

print;	
exit(0);
  

#-----------------------index subroutines-----------------------

sub insertIndex
{
  my $tag = shift;
  my $rec = "";
  my ($type) = $tag=~ m|type="(.*?)"|g;
  my $idx  = "\n<!-- Start of automatically inserted $type index -->\n";
  $idx .= "<DIV class=\"Index\">\n";
  my ($r, $s, $id, $name, $n, $passage, @entries, @index, %index);
 

  #--------------------------------------------------------------
  # make scripture-related indexes: scripRef, scripCom, scripture.
  # Make list of references and sort biblically. Name is book, 
  # chapter, verse list.
  #
  my ($oldbk,$bk,$bkid,$fch,$tch,$fv,$tv); 
  if ($type =~ m|^scrip|) { 
    (@entries) = m|(<$type[^>]*parsed.*?>)|gs;
    print STDERR "Building scrip index -- $#entries entries\n";

    foreach $s (@entries) {             # process each entry
#     print "Found $s\n";
      $rec = &attributes($s);		# get reference attributes and parse
      my (@parsed) = split /;/, $rec->{"parsed"};

      foreach $r (@parsed) {            # for each resulting parsed entry,
#       print "  Parsed ref: $r\n";
        my $i = {};
        ($i->{"version"}, $i->{"book"}, $i->{"fch"}, $i->{"fv"}, 
         $i->{"tch"}, $i->{"tv"}) = split /\|/,$r;
        $i->{"bookID"} = $bookID{$i->{"book"}};
        $i->{"url"} = $rec->{"target"} || $rec->{"id"};
#       print "Adding to index: $i->{'version'}|$i->{'book'}|$i->{'fch'}\n";
        push @index, $i;                # add to index
      }
    }
 
    # sort scripture reference index
    sub byref { $a->{"bookID"} <=> $b->{"bookID"} or
                $a->{"fch"} <=> $b->{"fch"} or
                $a->{"fv"} <=> $b->{"fv"} or
                $a->{"tch"} <=> $b->{"tch"} or
                $a->{"tv"} <=> $b->{"tv"} or
                $a cmp $b };
    @index = sort byref @index;
 
#   print "scripRef index after sort: \n";
#   printah(@index);
 
    my $continue=0;
    $oldbk="";
#   $idx .= "<UL class=\"bbook\">\n";

    foreach $s (@index) {              # add each entry to index
      $bk = $bookName[$s->{"bookID"}];
      $idx .= " </P>\n" if $continue and $oldbk ne $bk;
      $idx .= "<P class=\"bbook\">$bk</P>\n <P class=\"bref\">\n" 
              if $oldbk ne $bk;
      $oldbk=$bk;
      $continue=1;
      $idx .= " <A class=\"TOC\" href=\"$url|$s->{'url'}\">".$s->{"fch"};
      $idx .= ":$s->{'fv'}" if $s->{'fv'};
      $idx .= "-$s->{'tv'}" if $s->{'fch'} eq $s->{'tch'};
      $idx .= "-$s->{'tch'}:$s->{'tv'}" 
                if $s->{'tv'} and $s->{'fch'} ne $s->{'tch'};
      $idx .= "</A>&#160;&#160;\n";
    }
    $idx .= " </P>\n" if $continue;
    $idx .= "</DIV>\n<!-- End of $type index -->\n\n";

#   print $idx;
    return "$idx\n";


  #--------------------------------------------------------------------
  # handle subject index entries.  Gather all <index /> elements, 
  # sort by subjects, then build hierarchical index.
  #
  } elsif ($type eq "subject") { 
    (@entries) = m|(<index[^>]*?/>)|gs;	#make list of all index entries
    print STDERR "Processing subject index -- $#entries entries\n";
    foreach $s (@entries) {		#for each index entry
      $rec = &attributes($s);		#get references
      $rec->{"url"} = $rec->{"target"} || $rec->{"id"}; #if no target, us id
      $rec->{"name"} = $rec->{"title"};
      if (!$rec->{"name"}) {		#if no title, use ID/URL (cleaned up)
        $rec->{"name"} = $rec->{"url"};
        $rec->{"name"} =~ s/(\.p\d+)?\.\d+(_\d+)?$//s;	#delete paragraph, etc
        $rec->{"name"} =~ s/\./ /gs;
        $rec->{"name"} =~ s/ $//gs;
      }
      # cosmetic bug: if this is a genuine URL, not an ID, it'll get mangled

      push @index, $rec;
    }

#   printah(@index);                    #print out index entries 
    my ($s1, $s2, $s3, $s4, $new);
    $s1=$s2=$s3=$s4="";

    $SIG{__WARN__} = sub {};	#turn off warnings (uninitialized subjects)
    # sort subject index by subject4, subject3, etc.
    sub by_subj { $a->{"subject1"} cmp $b->{"subject1"} or
                  $a->{"subject2"} cmp $b->{"subject2"} or
                  $a->{"subject3"} cmp $b->{"subject3"} or
                  $a->{"subject4"} cmp $b->{"subject4"} or
                  $a->{"name"}    cmp $b->{"name"} }
    @index = sort by_subj @index;	#perl rocks!

    # output subject index as nested lists.
    # level is the depth of open lists; event is the lowest-level 
    # subject change.  For each new index entry, close off open lists
    # from level downto event, then open new lists as needed.

    my ($level, $event, $r);
    $level=1;
    $idx .= "<UL class=\"Index1\">\n";

    foreach $r (@index) {			#process all index entries
      $event=5;                                 #compute event level (5
      $event=4 if $s4 ne $r->{"subject4"};	#means no event, i.e. no 
      $event=3 if $s3 ne $r->{"subject3"};      #change in subjects
      $event=2 if $s2 ne $r->{"subject2"};
      $event=1 if $s1 ne $r->{"subject1"};
#     print "<br>lev$level event$event: $r->{'subject1'}|$r->{'subject2'}";
#     print "|$r->{'subject3'}|$r->{'subject4'}|$r->{'name'}\n";
      $s4=$r->{"subject4"}; $s3=$r->{"subject3"}; 
      $s2=$r->{"subject2"}; $s1=$r->{"subject1"}; 
         
      while ($event < $level) {	                #close off open lists
        $level--;
        $idx .= " "x$level . "</UL>\n";
      }

#     my $li = "";	#legal HTML requires <LI><UL>, but that makes the
      my $li = "<LI>";	#spacing of the list look bad! Extra blank line!
#     $idx .= " [First new subject: no new list needed]\n";
      if ($level==$event) {
        if ($level==1) {
          $idx .=" <LI>$r->{'subject1'}</LI>\n $li<UL class=\"Index2\">\n"; 
          $level++; 
        }
        elsif ($level==2) {
          $idx .="  <LI>$r->{'subject2'}</LI>\n  $li<UL class=\"Index3\">\n";
          $level++; 
        }
        elsif ($level==3) {
          $idx .="   <LI>$r->{'subject3'}</LI>\n   $li<UL class=\"Index4\">\n";
          $level++; 
        }
        elsif ($level==4) {
          $idx .="    <LI>$r->{'subject4'}</LI>\n    $li<UL class=\"Index5\">\n";
          $level++; 
        }
      }
        
#     $idx .= " [Now open any new needed lists]\n";
      if ($level==2 && $r->{'subject2'}) {
        $idx .= "  <LI>$r->{'subject2'}</LI>\n  $li<UL class=\"Index3\">\n"; 
        $level++; 
      }
      if ($level==3 && $r->{'subject3'}) {
        $idx .= "   <LI>$r->{'subject3'}</LI>\n   $li<UL class=\"Index4\">\n";
        $level++; 
      }
      if ($level==4 && $r->{'subject4'}) {
        $idx .= "    <LI>$r->{'subject4'}</LI>\n    $li<UL class=\"Index5\">\n";
        $level++; 
      }
        
      $idx .= "     <LI><A class=\"TOC\" href=\"$url|$r->{'url'}\">" .
                                               "$r->{'name'}</A></LI>\n";
    }

    while ($level-- > 0) {
      $idx .= " "x$level . "</UL>\n"; }
    $idx .= "</DIV>\n<!-- End of $type index -->\n\n";
    $SIG{__WARN__} = sub {warn $_[0];};		#turn warnings back on
    return $idx;

    
  #----------------------------------------------------------------
  #handle insertIndex type="contents", i.e. table of contents. If level
  #attribute is present, only index divs up to that level.
  } elsif ($type eq "contents") {
    my $count=1;
    my (@ids, @levels, @names, $i);
    my ($maxlev) = $tag =~ m|level="(.*?)"|;

    (@entries) = m|(<div[1-7].*?>)|g;
    print STDERR "Processing inserted TOC -- $#entries entries\n";
    
    foreach $s (@entries) {		   # for each <div element...
      my ($level) = $s =~ m|<div(\d)|;     # check if we're too deep
      next if (defined($maxlev) and ($level gt $maxlev));
      $levels[$count] = $level;
      my ($type) = $s =~ m|type="(.*?)"|;
      my ($n) = $s =~ m|n="(.*?)"|;	   # if there is a type and name,
      my ($name) = $s =~ m|title="(.*?)"|; # set name to type n. name
      $name ||= "";
#     print "Type=$type n=$n name=$name\n";
      $name = $type." ".$n.". ".$name if $type and $n;
      $names[$count] = $name;
      ($ids[$count++]) = $s =~ m|id="(.*?)"|;
    }

    for ($i=1; $i<$count; $i++) {          # now make table of contents
      $idx .= "<P class=\"TOC$levels[$i]\"><A class=\"TOC\" " .
              "href=\"$url|$ids[$i]\">" .
              "$names[$i]</A></P>\n";
    }
    $idx .= "</DIV>\n<!-- End of $type index -->\n\n";
    return $idx;

  #----------------------------------------------------------------
  # index of pages of print edition
  #
  } elsif ($type eq "pb") {
    (@entries) = m|(<pb[^>]*>)|gs;
    print STDERR "Processing page index -- $#entries entries\n";

  $idx .= "<UL class=\"Index1\">\n";

  my $count=0;
  foreach $s (@entries) {
    ($id) = $s =~ m|.*?id="(.*?)"|s;
    ($n) = $s =~ m|n="(.*?)"|s;
    $idx .= "<a class=\"TOC\" href=\"$url|$id\">$n</a>&nbsp;\n";
    $count++;
#   $idx .= "<br/>\n" if ($count % 10) == 0;
  }
  $idx .= "</UL>\n</DIV>\n<!-- End of page index -->\n\n";

  return $idx;

  #----------------------------------------------------------------
  # 1. foreign language index.  Language given by lang=".." attribute.
  # 2. all other indexes.
  # After finding the <index> elements, handle them the same.
  #
  } elsif ($type eq "foreign") {
    my ($lang) = $tag =~ m/lang="(.*?)"/;
    (@entries) = m|(<foreign[^>]*lang="$lang"[^>]*>.*?</foreign>)|gsi;
    my $num= $#entries + 1;
    print STDERR "Processing foreign($lang) index -- $num entries\n";
  } else {	#handle the rest: name, date, citation, etc.
    (@entries) = m|(<$type.*?</$type>)|gs;
    my $num= $#entries + 1;
    print STDERR "Processing $type index -- $num entries\n";
  }

  # at this point @entries contains all relevant index entries.
  # find pertinent info, sort, and return index.
  # (cases that already did this already returned.)
  foreach $s (@entries) {
#   print "Processing entry---------\n$s\n--------\n";
    ($id) = $s =~ m|.*?id="(.*?)"|s;
    ($name) = $s =~ m|.*?>(.*)<.*?|s;
    $name = $1 if $s =~ m|title="(.*?)"|s;
    ($n) = $s =~ m|n="(.*?)"|s;
    $name = $n unless $name;
    $name =~ s/id=".*?"//g;
    $name =~ s/(>[^<]{60,60}[^<]*?\w+)[^<]*/$1.../gsi;	  #shorten long bits
    $index{$name} = $id;
#   print "-->name $name\n";
  }
  
  $idx .= "<UL class=\"Index1\">\n";
  foreach $s (sort keys %index) {
    $idx .= " <LI><A class=\"TOC\" href=\"$url|$index{$s}\">$s</A></LI>\n";
  }
  $idx .= "</UL>\n</DIV>\n<!-- End of $type index -->\n\n";
  return $idx;
}


# this subroutine finds all of the attributes of the form xxx="yyy" and
# returns them as a reference to a hash.
sub attributes
{
  my $tag = shift;
  my $rec = {};
  my $s;

  while ($tag =~ s|(\w+)="(.*?)"||s) {		#find all attributes
    $rec->{$1} = $2; }

  return $rec;
}
  

# this sub, used for debugging, prints contents of an array of hashes
sub printah
{
  print "-------------\n";
  my ($role, $i);

  for $i ( 0 .. $#_ ) {
    print "$i is { ";
    for $role ( keys %{ $_[$i] } ) {
      print "$role=$_[$i]{$role} ";
    }
    print "}\n";
  }
  print "-------------\n"
}


sub readbookID
{
  open BOOKID, "</home/hplantin/bin/bookID.unl" or die "!$\n";
  my $n;
  while (<BOOKID>) {
    chop;
    s|#.*||;
    next if m|^\s*$|;
#   print "\nRead $_\n";

    my @names  = split /:/, $_;
    my $bkID   = shift @names;
    my $bkname = shift @names;
    my $chaps  = shift @names;
    $bookName[$bkID] = $bkname;
    $chapters[$bkID] = $chaps;
    $bookID{$bkname} = $bkID;
    foreach $n (@names) {
      $bookID{$n} = $bkID; }
  }

  close BOOKID;
}

