#!/usr/local/bin/perl -w
# 
# classify: this program converts style="..." attributes in ThML documents
# to class="..." attributes and stylesheet entries.  
#
# classify v0.5, Harry Plantinga, Nov. 8, 1998. This program may be 
# distributed and used under the terms of the Artistic License.
# 
# v0.51,  Jan 25, 1999.  Modified to work with voyager/xml.
#
# First, style attributes that are default for an element are deleted. 
#
# Then, the if there are differences between remaining styles and the
# default styles for the element, a new stylesheet entry for the current
# document is added. 
#
# Bugs: should read and parse ThML stylesheet, not program it all in.
# Major bug:  rtf2xml doesn't pass indentation and space-before/after 
# info thru to trn file, so modifying a paragraph such as continue 
# loses that information.  
#
use strict;

my ($input, %styles, $sty, %seen, @id, %stylename, %stylesheet );
      # stylename gives a name to each style combination seen so far
      # so far and the class name associated with them.
my $styleid =  "s01";   #initial new class name
my $styleapp = "01";    #initial new class suffix 
my $debug = 0;

&defaults;

while (<>) 
  { $input .= $_; }

#first delete styles from hr, verse
$input =~ s#(<(hr|verse|name|cite|unclear)[^>]*)style=".*?"#$1#gs;

#since footnotes are nested, handle them first, then escape them.
#then handle the rest of the document, and unescape the footnotes.

#just classify _contents_ of note
$input=~ s#(<note\b[^>]*?>)(.*?)(</note>)#$1.&escape(&multiclass($2)).$3#gse;
print STDERR "Done processing footnotes\n" if $debug;

# now classify rest of block-level elements
$input=~ s#(<(p|h[1-6]|attr|sectionInfo|l|term|def)\b[^>]*?>.*?</\2>)#&classify($1)#gse;
print STDERR "Done processing blocks\n" if $debug;
$input =~ s|&less-than;|<|g;

my $stylesheet = "<link rel=\"stylesheet\" type=\"text/css\" " .
                 "href=\"/css/ThML099.css\"/>\n<style type=\"text/css\">\n";
my $s;
foreach $s (sort keys %stylesheet) {
    $stylesheet .= "$s\t{ $stylesheet{$s} }\n";
}
$stylesheet .= "</style>\n";
$input =~ s|(</ThML.head>)|$stylesheet$1|;

print $input;	
exit(0);


#---------------------subroutines-----------------
#defaults: load in default css element styles
sub defaults
{
  %styles = (
  "h1" => ["font-family: 'Palatino';","font-size: 100%;","color: black;",
    "font-weight: bold;","font-style: normal;","text-align: center;"], 
  "h2" => ["font-family: 'Palatino';","font-size: 100%;","color: black;",
    "font-weight: bold;","font-style: normal;","text-align: center;"], 
  "h3" => ["font-family: 'Palatino';","font-size: 100%;","color: black;",
    "font-weight: bold;","font-style: normal;","text-align: center;"], 
  "h4" => ["font-family: 'Palatino';","font-size: 100%;","color: black;",
    "font-weight: normal;","font-style: italic;","text-align: left;"], 
  "h5" => ["font-family: 'Palatino';","font-size: 100%;","color: black;",
    "font-weight: normal;","font-style: italic;","text-align: left;"], 
  "h6" => ["font-family: 'Palatino';","font-size: 100%;","color: black;",
    "font-weight: normal;","font-style: italic;","text-align: left;"],
  "attr" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: right;"],
  "term" => ["font-family: 'Palatino';","font-size: 110%;","color: black;", 
	"font-weight: normal;","font-style: normal;","text-align: right;"],
  "def" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: right;"],
  "sectionInfo" => ["font-family: 'Palatino';","font-size: 110%;",
	"color: black;", "font-weight: normal;","font-style: italic;",
	"text-align: right;"],
  "Normal" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "p" 	=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "hr" 	=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "Continue"=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "First" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "Resume" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "Footnote"=> ["font-family: 'Palatino';","font-size: 100%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "endnote text"=> ["font-family: 'Palatino';","font-size: 100%;" ],
  "endnote reference"=> ["font-family: 'Palatino';","font-size: 100%;",
                            "vertical-align: super;" ],
  "code" => ["font-family: 'Courier New';","font-size: 100%;",
	"color: black;"],
  "cite" => ["font-style: italic;", "color: black;" ],
  "span" => [ ],
  "l" 	=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "1" 	=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "2" 	=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "3" 	=> ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "list1" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "list2" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "list3" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "list4" => ["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "listCont1"=>["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "listCont2"=>["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "listCont3"=>["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"],
  "listCont4"=>["font-family: 'Palatino';","font-size: 110%;","color: black;",
	"font-weight: normal;","font-style: normal;","text-align: left;"] );
}


# multiclass: handle multi-paragraph footnotes
sub multiclass
{
  my $instring=shift;
  my (@pars, $par, $multi);

  $multi = 0;
  @pars = split(m|</p>|,$instring);
  foreach $par (@pars) {
    $par .= '</p>';
    $multi++;
    print "Before par: $par\n" if $debug;
    $par = &classify($par);
  }
  warn "Got $multi-paragraph footnote\n" if $multi && $debug;
  join("",@pars);
}


# classify:  this subroutine processes block-level elements. It changes
# styles to classes according to the following algorithm:
#
# (1) delete default styles for the element from all style="" attributes
# (2) delete non-default styles for the element from span styles.
# (3) if any span styles remain that apply to the entire element, merge
#     with element styles and delete span
# (4) if any spans remain that only embolden or italicize, change to <B>
#     or <I>
# (5) if styles are left for the element and 
#     (a) it has a class, create a new one with a name such as First-x01.
#         it should have old class styles + mods.
#     (b) it doesn't have a class, create one with a name such as s01.
# (6) for each remaining span with styles,
#     (a) if it has a class, modify, e.g. cite-x01, and add to stylesheet
#     (b) if it has no class, create one, e.g. s01; add to stylesheet

sub classify
{
  $_ = shift;
  print "--------\nCLASSIFY: $_\n" if $debug;
  my ($s, $name, $first, @stylelist, @spans);
  if (m|.*<p\b.*<p\b|s) {
    print "Fatal error: classify got $_\n";
    die "Fatal error: classify got two paragraphs! See output.\n";
  }

# (1) delete default styles from all style elements
  my ($tag) = m|^<(\w*)|;
  my ($tagclass) = m|^<[^>]*?class="(.*?)"|s;
  my $tagkey = $tag; 
  $tagkey = $tagclass if $tagclass;             #tagkey gives us defaults
  print "=tagkey:   $tagkey\n" if $debug;
  foreach $s (@{ $styles{$tagkey}}) 
    { s|$s||gs; }                               #delete default styles
  print "--\nAfter deleting defaults: $_\n" if $debug;

# (2) if any element styles are left over, delete from remaining styles
  my ($tagstyle) = m|^<[^>]*?style="\s*(.*?)\s*"|s;
  @stylelist = split /;/, $tagstyle if $tagstyle and $tagstyle =~ /;/;
  foreach $s (@stylelist)
    { s|(.*?>.*?style="[^"]*)$s;|$1|gs; }
  print "--\nAfter deleting element styles from spans: $_\n" if $debug;

# (3) merge whole-element span styles with element styles
  $_ = normalize($_);
  if ( m|<$tag[^>]*style=[^>]*><span[^>]*style=[^>]*>[^>]*</span></$tag>|s) {
     print "--\nFound element-wide style: $_\n" if $debug;
     s|(style=".*?)(".*?)\s*style="(.*?)"|$1 $3$2|s;
     print " --> changed to $_\n" if $debug;
     ($tagstyle) = m|^<[^>]*?style="\s*(.*?)\s*"|s;
  }
  $_ = normalize($_);

# (4) change remaining spans that embolden or italicize to <b> or <i>
  s|<span\s+style="\s*font-weight:\s+bold;\s*">(.*?)</span>|<b>$1</b>|gs;
  s|<span\s+style="\s*font-style:\s+italic;\s*">(.*?)</span>|<i>$1</i>|gs;
  print "--\nAfter adding <b> and <i>: $_\n" if $debug;

# (5) if any element styles are left over, find or make a new class
  if (s|^(<[^>]*)style=".*?"|$1|s) {	#if this tag had styles, delete them

# (5a): if there was a tag class
    if ($tagclass) {  
      print "--\nCase 5a. Tag=$tag class=$tagclass.  " if $debug;
      print "Added styles: $tagstyle\n" if $debug;

      if ($tagclass eq "Continue") 
      {
         $tagstyle .= " text-indent: 0.5in;";
         print "Continue: styles $tagstyle\n" if $debug;
      }

      #first check if we have this class on record. If not, add.
      if (!$styles{$tagclass}->[0]) {
        print "Hmmm, seems to be a new class. Let's add it.\n" if $debug;
        my @a1 = split /;/, $tagstyle;
        my @a2;
        foreach $s (@a1) {			#for each style, 
          $s =~ s|^\s*(.*?)\s*$|$1;|s;		#delete first&last spaces,
          push @{ $styles{$tagclass} }, $s;	#add to styles hash, 
        }
        $stylesheet{".".$tagclass} = $tagstyle;	#add stylesheet entry
        print "Stylesheet entry: .$tagclass { $tagstyle }\n" if $debug;

      } else {	                  #there was an existing class. Modify it.
        my $stykey = "$tagclass-$tagstyle";
        $stylename{$stykey} = $tagclass.$styleapp++ 
            unless $stylename{$stykey};
        $name = $stylename{$stykey};	
        my $newstyles = "";
         
        # add each default style to stylesheet entry unless 
        # it is overridden by new styles for this tag
        foreach $s (@{$styles{$tag}}) { 
          my ($stylekey) = $s =~ m|(.*?):|s; 
          $newstyles .= $s . " " unless $tagstyle =~ m|$stylekey|s;
        }
        $newstyles .= $tagstyle;
        $stylesheet{".".$name} = $newstyles unless $stylesheet{".".$name};
        print "Stylesheet entry 1: .$name { $newstyles }\n" if $debug;
        s|class="$tagclass"|class="$name"|;
      }
    }

# (5b): if there was no tag class
    else {				#there was no class
      print "--\nCase 5b. Tag $tag; no class.  " if $debug;
      $stylename{$tagstyle} = $styleid++ unless $stylename{$tagstyle};
      $name = ".".$stylename{$tagstyle};
      $stylesheet{$name} = $tagstyle unless $stylesheet{$name};
      print "Stylesheet entry 2: $name { $tagstyle }\n" if $debug;
      s|>| class="$stylename{$tagstyle}">|;
    }
  }
  print "After new element class: $_\n" if $debug;


# (6) now process each enclosed span, cite, or code (styled inline element)
  if (s#(.*?)<(span|cite|code)#<$2#si) {
    my $startstuff = $1; 
    while (s#(<(span|cite|code).*?</\2>.*?)<(span|cite|code)#<$3#si) {
      push @spans, $1;
    }
    push @spans, $_;
    $_ = $startstuff;				#build element to return
  
    my $span;
    foreach $span (@spans) {			#process each span
      if ($span =~ m/<(span|cite|code).*?<(span|cite|code)/) {
         warn "Hey -- there are two span|cite|codes in that chunk ($span)!\n"
      }
      my ($spantag) = $span =~ m|^<(\w+)|;
      print "Processing $spantag: $span\n" if $debug;
      foreach $s (@{ $styles{$spantag}}) 
        { $span =~ s|$s||g; }                   #delete default styles
      $span = &normalize($span);
      print "After deleting span defaults: $span\n" if $debug;

      my ($spanstyle) = $span =~ m|^<[^>]*style="(.*?)"|s;
      my ($spanclass) = $span =~ m|^<[^>]*class="(.*?)"|s;

      # now delete default styles for class $spanclass, if any
      if ($spanclass and $styles{$spanclass}->[0]) {
        foreach $s (@{ $styles{$spanclass}}) 
          { $span =~ s|$s||g; }                   #delete default styles
        $span = &normalize($span);
#       print "After deleting spanclass defaults: $span\n";
      }

      print "--------Processing $spantag:-------\n$span\n" if $debug;
      print " -style: $spanstyle\n" if $spanstyle and $debug;
      print " -class: $spanclass\n" if $spanclass and $debug;
  
      if ($spanstyle) {
        $span =~ s|^(<[^>]*)style=".*?"|$1|s;	#delete span style
        if ($spanclass) {
          # case (6a): have a span with style and class.
          print "-----Span 6a-----\nStartstuff: $startstuff\n" if $debug;
          print "Processing span: $span\n" if $debug;
          print "Style=$spanstyle Class=$spanclass\n" if $debug;

          # first check if we have this class on record. If not, add.
          if (!$styles{$spanclass}->[0]) {
            print "Hmmm, seems to be a new class. Let's add it.\n" if $debug;
            my @a1 = split /;/, $spanstyle;
            my @a2;
            foreach $s (@a1) {                    #for each style, 
              $s =~ s|^\s*(.*?)\s*$|$1;|s;        #delete first&last spaces,
              push @{ $styles{$spanclass} }, $s;  #add to styles hash, 
            }
            $stylesheet{".".$spanclass} = $spanstyle; #add stylesheet entry
            print "Stylesheet entry: .$spanclass { $spanstyle }\n" if $debug;
          } 
          else 	#we've already seen this class -- modify it, class-x01
          {
            my $stykey = "$spanclass-$spanstyle";
            $stylename{$stykey} = $spanclass.$styleapp++ 
                unless $stylename{$stykey};
            $name = $stylename{$stykey};	
            my $newstyles = "";
            print "stykey=$stykey name=$name;\n" if $debug;
          }

        } else {	#span with no span class.  Make one.
          $stylename{$spanstyle} = $styleid++ unless $stylename{$spanstyle};
          $name = ".".$stylename{$spanstyle};
          $stylesheet{$name} = $spanstyle;
          print "Stylesheet entry 3: $name { $spanstyle }\n" if $debug;
          $span =~ s|>| class="$stylename{$spanstyle}">|;
        }
      }
      print "->result $span\n" if $debug;
      $_ .= $span;
    }
    print "After processing spans: $_\n" if $debug;
  }
  $_ = normalize($_);

  print "Returning:  $_\n" if $debug;
  print "-----------------------------\n" if $debug;
  if (m|style=|) { 
    warn "Warning: classify left some remaining styles.\n"; 
    print "***Styles remaining*** in $_\n" if $debug;
  }
  return $_;
}


#escape notes so they don't get processed later
sub escape 
{
  my $stuff = shift;
  $stuff =~ s/</\&less-than;/g;
  return $stuff;
}


#remove extra spaces, etc from styles.
sub normalize
{
  my $x = shift;
  my $twosize = ($x =~ m|font-size.*font-size|);
# print "In normalize: styles $x\n" if $twosize;

  # delete the second font size spec if more than one occurs
  $x =~ s|font-size:\s*[^;"]+?;(.*font-size:\s*[^;"]+?;)|$1|gs;  
  $x =~ s|  | |g;                          #compress spaces
  $x =~ s|style="\s*"||g;                  #delete empty styles
  $x =~ s|(style=")\s+([^"]*?")|$1$2|g;    #delete leading spaces
  $x =~ s|(style="[^"]*?)\s+(")|$1$2|g;    #delete trailing spaces
  $x =~ s|\s+>|>|g;                        #delete trailing tag spaces
  $x =~ s|<span\s*>([^<]*)</span>|$1|gs;   #delete empty span
# print "Returning $x\n---------\n" if $twosize;
  return $x;
}
