#!/usr/bin/perl -w
#
# de-ms: remove microsoft-specific xml and html from html file
# as saved from Microsoft Word 2000.  Note that this can include
# some useful information...
#
# version 0.3, 1999-11-04, Harry Plantinga. Bugfixes and testing on one
#    or two documents, but it's still no doubt got some bugs.
# version 0.2, 1999-11-04, Harry Plantinga. There are probably
# still some bugs left here.
#
use strict;
use ThMLutil;

# 
# read entire input into $input
#
# Every file named on the command line (or STDIN when there are none) is
# slurped in one go.  $/ is localized to undef so that <> hands back each
# file as a single record; join() concatenates those records and yields the
# empty string -- not undef -- when there is no input at all, so length()
# and the substitutions that follow never operate on an undefined value.
#
my $input = do { local $/; join '', <> };
$_ = $input;
print STDERR "de-ms: " . length($input) . " bytes read\n";

#
# first, let's fix up footnotes.
# Turn Word's style='mso-element:...' markers on the note containers into
# ordinary class attributes:
#   <div style='mso-element:footnote-list'>  ->  <div class="footnote-list">
#   <div style='mso-element:footnote' id=X>  ->  <div class="footnote" id="X">
# and likewise for "endnote".  X is everything up to the next '>' or '?'.
# (The <a>/<sup> markup of the note references is rewritten at the very end
# of the script, not here.)
#
s/<div\s+style='mso-element:(end|foot)note-list'>/<div class="$1note-list">/gsi;
s/<div\s+style='mso-element:(end|foot)note' id=([^>?]*)>/<div class="$1note" id="$2">/gsi;
print STDERR "Removing Microsoft-specific stuff: 1";

#
# delete drop caps
#
# Both substitutions match a <div style='mso-element:dropcap...'> together
# with the table inside it, throw that markup away, and put the single
# captured character ($1) back directly after the opening tag(s) of the
# paragraph that follows the div ($2).  So the drop-cap formatting goes, but
# the letter itself is kept as the first character of the next paragraph.
#
# Case 1: the letter is the character right after "grid-mode:line'>" and the
# following paragraph opens with <p ...><span ...>.
s|<div style='mso-element:dropcap[^>]*>\s*<table.*?grid-mode:line'>(.)<.*?</table>\s*</div>\s*(<p[^>]*><span[^>]*>)|$2$1|gis;
# Case 2 (whatever case 1 left behind): the letter is the character
# immediately before <o:p>, and only <p ...> is required after the div.
s|<div style='mso-element:dropcap.*?(.)<o:p>.*?</table>\s*</div>\s*(<p[^>]*>)|$2$1|gis;
print STDERR "2";


#
# general cleanup
#
# _general_cleanup($html): return a copy of $html with Word's document
# header attributes, style-sheet clutter, <o:...> elements and
# conditional-comment markers removed.  The argument is not modified.
# The passes run in the order written; later ones rely on earlier ones.
#
sub _general_cleanup {
  my ($html) = @_;
  for ($html) {				# alias $_ to our private copy
    # replace everything from "<html" through the first "head>" with a
    # plain <html><head>
    s|<html.*?head>|<html><head>|s;
    s|<style>|<style type="text/css">|s;
    s|\s*\@font-face\s*{.*?}||gs;	# delete font-face definitions
    # delete page definitions: from the first @page up to the next "-->",
    # leaving a newline and the "-->" itself
    s|\s*\@page.*?-->|\n-->|is;
    s|/\*.*?\*/\s*||gs;			# delete c-style comments
    # delete doc props.  The match is non-greedy and includes the closing
    # '>' of the end tag; without the '>' a stray ">" was left in the
    # output, and a greedy .* would run on to the last such end tag.
    s|<o:DocumentProperties>.*?</o:DocumentProperties>||s;
    # delete o: elements whose content contains no '<'.  The back-reference
    # is the whole text between '<' and '>' of the start tag, so only
    # elements whose end tag repeats it exactly are matched.
    s|<(o:[^>]*?)>[^<]*?</\1>||gs;
    # delete those weird conditional tags: <!--[if ...]> / <![if ...]> and
    # <![endif]--> / <![endif]>.  No /s here, so the opening marker must
    # sit on one line.
    s|<\!-*\[if .*?>||g;
    s|<\!\[endif]-*>||g;
    # a paragraph holding nothing but a span around &nbsp; becomes a bare
    # <p class="Normal">&nbsp;</p>
    s|<p[^>]*>\s*<span[^>]*>\s*&nbsp;\s*</span>\s*</p>|<p class="Normal">&nbsp;</p>|gsi;
    #s|\s*align=center||g;	#this tag fixes a netscape bug
  }
  return $html;
}
$_ = _general_cleanup($_);
print STDERR "3";

# Drop print-layout declarations: page-break hints, zero left/right margins
# and the ".0001pt" bottom margin.  Every pattern requires the trailing ';',
# so a declaration without one is left alone.
s/\s*page-break-(before|after):(avoid|always);//gs;
s|\s*margin-right:0in;||gs;
s|\s*margin-left:0in;||gs;
# NOTE(review): the '.' before 0001pt is unescaped, so it matches any single
# character, not just a literal dot.
s|\s*margin-bottom:.0001pt;||gs;

# word uses nonstandard <br clear="all"/> for section break--let's delete it
# Step 1 replaces a <span> that contains only such a break with the bare
# break; step 2 then deletes every <br clear=all ...>.  Together they remove
# the break and its wrapping span.  The commented-out lines are the
# alternatives (delete in one step / keep the break in normalized form).
 s|<span[^>]*>\s*<br\s+clear="?all[^>]*>\s*</span>|<br clear="all"/>|gis;
#s|<span[^>]*>\s*<br\s+clear="?all[^>]*>\s*</span>||gis;
 s|<br\s+clear="?all[^>]*>||gis;
#s|<br clear="?all[^>]*>|<br clear="all"/>|gis;
# Close the remaining <br>/<hr> tags XML-style: first the bare form, then
# the form carrying attributes (<br .../>).
 s/<(br|hr)>/<$1\/>/gis;
 s/<(br|hr)\s+([^>]*)>/<$1 $2\/>/gis;
print STDERR "4";

# more non-html and ms-specific stuff: drop every embedded <xml>...</xml>
# island first.
s{<xml.*?</xml>}{}gs;

# Then remove Word's bookkeeping tags.  There is deliberately no /g here:
# only the first occurrence of each tag is deleted, one tag per pass, in
# this order.
for my $head_tag ('meta name=Originator',
                  'meta name=Generator',
                  'meta name=ProgId',
                  'meta http-equiv',
                  'link rel=File-List') {
  s{<\Q$head_tag\E[^>]*>}{}s;
}

s{\.0pt}{pt}g;			# normalize font point sizes (e.g. 12.0pt -> 12pt)
s{MsoNormal}{Normal}g;		# change style MsoNormal to Normal
#s| align="center"||g;		# this is redundant with styles

print STDERR "5";
s|\r||g;				# delete those nasty dos return chars
					# add quotes around attributes
# Matches '<', a run of non-space characters, one whitespace character, an
# attribute name, '=', and an unquoted value made of letters, digits, '-'
# or '_'; the value is wrapped in double quotes.
s|(<[^ ]+\s[-_a-zA-Z0-9]+\s*=\s*)([-_a-zA-Z0-9]+)|$1"$2"|gis;	
# use double quotes for style atts.  Three shapes are handled:
#   style='...'              no quotes inside          -> style="..."
#   style='..."a"..."b"...'  four double quotes inside -> outer ones become
#                            double, inner ones single
#   style='..."a"...'        two double quotes inside  -> same treatment
# Any other number of embedded double quotes is left untouched.
s|style='([^'"]*)'|style="$1"|gis;	# use double quotes for style atts
s|style='([^'"]*?)"([^'"]*?)"([^'"]*?)"([^'"]*?)"([^'"]*?)'|style="$1'$2'$3'$4'$5"|gis;
s|style='([^'"]*?)"([^'"]*?)"([^'"]*?)'|style="$1'$2'$3"|gis;

print STDERR "6";


#
# now delete some ms-specific styles and other general housekeeping
#
# _strip_ms_styles($html): return a copy of $html with mso-* style
# declarations, a few Word-only CSS properties, empty inline elements and
# empty title/style attributes removed.  The argument is not modified.
# The passes run in the order written; later ones tidy up after earlier
# ones (e.g. an emptied style="" is deleted near the end).
#
sub _strip_ms_styles {
  my ($html) = @_;
  for ($html) {				# alias $_ to our private copy
    s|\s*mso-[^>]*?;||gis;		# mso-...; declarations (within one tag)
    s|\s*mso-[^>"]*"|"|gis;		# a last mso-... right before the closing quote
    s|;?\s*layout-grid-mode:\s*line\s*||gs;
    s|;?\s*text-autospace:\s*none\s*||gs;
    s|<span[^>]*>\&nbsp;</span>|\&nbsp;|gs;	# unwrap a span around a lone &nbsp;
    s|<span[^>]*>\s+</span>| |gs;	# whitespace-only span -> one space
    s|<span[^>]*></span>||gs;		# empty span -> nothing
    # delete empty elements.  This has to be an alternation: the earlier
    # form [b|i|span|sup] was a character class, i.e. it matched any ONE of
    # the characters b | i s p a n u, so empty <sup>/<span> were never
    # removed while unrelated one-letter tags such as <p></p> or <a></a>
    # were.
    s/<(b|i|span|sup)\s*>\s*<\/\1>//gis;
    s|&quot;|"|g;
    s|\s+title\s*=\s*""||gs;		# delete empty title atts
    # collapse multiple blank lines.  NOTE(review): this is a single pass
    # over non-overlapping pairs, so a run of N newlines shrinks to about
    # N/2, not to one.
    s|\n\n|\n|gs;

    s|style="\s*;|style="|gs;		# drop a leading ';' left in a style value
    s|\s*style=(['"])\s*\1||gs;		#delete empty style atts
    s|<span\s*>([^<]*)</span>|$1|gs;	#delete spans without atts
  }
  return $html;
}
$_ = _strip_ms_styles($_);
print STDERR "7";

# 
# let XML pass through
#
# Text wrapped as <span class="XML"><span ...>text</span></span> (where text
# contains no '<') is replaced by unescape(text); the /e flag makes the
# replacement side a Perl expression.  unescape() is not defined in this
# file -- presumably it is imported from ThMLutil; confirm.  class="XML" is
# presumably a Word character style used to mark literal markup -- confirm.
#
# The patterns run from most to least specific:
#  1. a paragraph consisting only of the XML span: the <p> wrapper goes too
#  2. the same, with an <a ...> between the <p> and the span
#  3. an XML span that ends a paragraph right after another </span>: the
#     paragraph is closed first and the text follows on a new line
#  4. any XML span still left, wherever it is
#
s|<p[^>]*><span\s+class="XML"><span[^>]*>([^<]*)</span></span></p>|&unescape($1)|gsie;
s|<p[^>]*><a[^>]*><span\s+class="XML"><span[^>]*>([^<]*)</span></span></a></p>|&unescape($1)|gsie;
s|</span><span\s+class="XML"><span[^>]*>([^<]*)</span></span></p>|"</span></p>\n".&unescape($1)|gsie;
s|<span\s+class="XML"><span[^>]*>([^<]*)</span></span>|&unescape($1)|gsie;

print STDERR "8\n";

#
# now fix up footnote and endnote references
#
# A reference is an <a href="#_edn..."> or <a href="#_ftn..."> followed by
# one or more opening <span ...> tags, the marker text (no '<' in it) and one
# or more closing </span> tags.  The anchor gets a class attribute and the
# span wrappers are replaced by a single <sup> of the same class around the
# marker text.  Endnotes are processed first, then footnotes.
#
for my $note_kind ([ 'edn', 'endnote' ], [ 'ftn', 'footnote' ]) {
  my ($anchor_prefix, $class) = @$note_kind;
  s{(<a)\s+(href="\#_${anchor_prefix}[^>]*>)((\s*<span[^>]*>)+)([^<]*)((\s*</span>)+)}{$1 class="$class" $2<sup class="$class">$5</sup>}gsi;
}

print;					#done! print it out.

