#!/usr/bin/perl -w
#
# identify v0.5, Harry Plantinga, Nov. 8, 1998. This program may be 
# distributed and used under the terms of the Artistic License.
#
# v0.54.  1999-11-05, modified not to generate IDs for a few tag types.
# v0.53.  Dec. 3.  Fixed IDs generated when there were previous IDs.
# v0.52.  Nov. 25. Corrected unique ID code.
# v0.51.  Nov. 20.  
#   - Ensure that IDs created are unique. 
#   - Don't add ids to tags that already have them. 
#   - Write output directly, to decrease memory requirements.
# 
# identify: add ids to all body elements. ids are based on divs:
# they have the form nn.nn.nn.pmm.ee, where the nns are the 
# numbers (or first 12 characters of the name) of the containing
# div1, div2, and div3 (etc.).  The mm is the number of the 
# paragraph in the current div, and the ee is the number of the
# element in the current paragraph.
#
# bugs:  
#   - it is SLOW with big files!!! 
#   - It really ought to read through the file and put any IDs it
#     finds into %ids_uses, so they can't be reused.
#
use strict;
my ($input, @id, %ids_used);
my $debug=0;

while (<>) 
  { $input .= $_; }
$_ = $input;
print STDERR "identify: read " . length($input) . " bytes\n";

  #first, change n attribute of PB tags to Page_n id
  s|(<pb[^>]*n=")([^"]*)("[^>]*?)\s*(/?>)|$1$2$3 id="Page_$2"$4|gs;

  &id();			#add ids to tags

  exit(0);

  

#------------ add tag ids --------------
# 
# This subroutine adds an id attribute to each element after the
# first <div1>. The id is of the form xxx.xxx.xxx.pyy.zz, where
# each xxx identifies a div, yy is the paragraph number in the
# division, and zz is the element number.
#


sub id
{
# $_ = shift;
  my $p = "0";
  my $t = "0";
  my ($level, $oldlevel, $stuff, $tag);
  my $sect = "";

  s|(.*?)(<div\d)|$2|s;
  print $1;
  
  while (s|^([^<]*)(<[^>]*>)||s) {
    print $1;
    $tag = $2;

    # dont give IDs to these tags
    if ($tag =~ m/^(<i>|<b>|<br|<hr|<sup|<note|<span)/) {
        print $tag;
        next;
    }

#   print STDERR "Processing $tag\n";

    if ($tag =~ m|id="|) {		#already had an id?
      print $tag;
      if ($tag =~ m|<div|) {		#if it's a div, remember section ID
         $sect = $1 if $tag =~ m|id="(.*?)"|; }
      next;
    }

    if ($tag =~ m/(<!--.*?-->|<\/.*?>)/s) { 	#skip comments, end tags
      print $1;
      }
    elsif ($tag =~ m|<div([1-7])([^>]*)>|s) #a div without an ID...
    {
#added
      my ($x,$xsect);
#end added
      $oldlevel = $level;
      $level = $1;			#new level
      $stuff = $2;			#attributes of div tag
      $p = "0";				#reset paragraph counter to 0
      $t = "0";				#reset element counter to 0
      print STDERR "Found div: <div$1$2> --> " if $debug;
      $sect = getName($level, $stuff);	#get section name
      $sect = "x" unless $sect;
      print STDERR "$sect\n" if $debug;
      if ($ids_used{$sect}){
        print STDERR "HEY--$sect was already used! (1)\n" if $ids_used{$sect};
#        $sect .= "1";
#        while ($ids_used{$sect}) {
#          $sect++; 
#        }
#replaced
        $x = 1;
        $xsect = $sect.'_'."$x";
        while ($ids_used{$xsect}) {
          $x++;
          $xsect= $sect.'_'."$x";}
        $sect = $xsect;
#end replaced
        print STDERR "    -- let's use $sect instead.\n";
      }
      $ids_used{$sect} = 1;
      $tag =~ s|(^<div[1-7].*?)>|$1 id="$sect">|s;  #add section name as id
      print $tag;
    }
    elsif ($tag =~ m|(<P[^>]*)>|s)
    {
      $p++;				#increment paragraph counter
      $t = "0";				#reset tag counter to 0
#     print STDERR "1=$1 sect=$sect p=$p\n";
      my $id = "$sect.p$p";
      if ($ids_used{$id}) {
        print STDERR "HEY -- $id was already used! (2)\n" if $ids_used{$id};
        $id .= "1";
        while ($ids_used{$id}) { 
          print "Before increment, sect=$sect\n";
          $id++; 
          print "Before increment, sect=$sect\n";
        }
        print STDERR "    -- let's use $id instead.\n";
      }
      $ids_used{$id} = 1;
      print "$1 id=\"$id\">";
    }
    elsif ($tag =~ m|(<[^>]*?)(/?>)|s)	#any other tag, possibly <  />
    {
      $t++;				#increment tag counter
      my $id = "$sect.p$p.$t";
#     print STDERR "Assigning ID $id\n";
      if ($ids_used{$id}) {
        print STDERR "HEY -- $id was already used! (3)\n" if $ids_used{$id};
        $id .= "1";
        while ($ids_used{$id}) { 
          $id++; }
        print STDERR "    -- let's use $id instead.\n";
      }
      $ids_used{$id} = 1;
      print "$1 id=\"$id\"$2";
    }
  }

  print $_;
}


# 
# this function takes info from a <div> tag and returns 
# the current section (e.g. 1.3.2) 
#
# side-effect: modify global $id[] array of current context
# 
sub getName
{
  my $level = shift;
  my $rest = shift;
  my $divtitle = "";
  my $type = "";
  my $n = "";
  my ($sect, $i);

  $divtitle = $1 if $rest =~ m|title="(.*?)"|s;		#find title
  $type = $1 if $rest =~ m|type="(.*?)"|s;		#find type
  $n = $1 if $rest =~ m|n="(.*?)"|s;			#find n
#  print STDERR "title=$divtitle type=$type n=$n\n";
  
  if ($n)                                 #each level id is n if it exists 
    { $id[$level] = $n; }
  else
    { $id[$level] = $divtitle; }          #if no n, use divtitle as id

  $id[$level] =~ s|&.*?;||g;              #delete entities
  $id[$level] =~ s|^the||gi;              #delete initial _the_
  $id[$level] =~ s|[^-_0-9a-zA-Z]||gs;    #delete chars not in list
  $id[$level] =~ s|(.{1,12}).*|$1|;       #limit to 12 characters
 
  $divtitle = "$type $n: " . $divtitle if $type and $n;
  $sect = "";
  for ($i="1"; $i le $level; $i++)        #add id for each level
    { $sect .= ".".$id[$i]; }
  $sect =~ s|^\.||;
  $sect = "s$sect" if $sect =~ m/^[0-9]/;    #legal IDs can't start with digit

  return ($sect);
}

