w3m

Unnamed repository; edit this file to name it for gitweb.
git clone https://logand.com/git/w3m.git/
Log | Files | Refs | README

html2latex (10574B)


      1 #!/usr/local/bin/ruby
      2 
      3 #
      4 #       HTML to LaTeX converter
      5 #         by A. Ito, 16 June, 1997
      6 #
      7 
      8 require 'kconv'
      9 
     10 # configuration
     # Converts a GIF image to EPS by running ImageMagick's `convert`.
     #
     # The command line is echoed to STDERR before it runs. The arguments are
     # handed to `system` as separate words, so no shell is involved: file
     # names containing spaces or shell metacharacters reach `convert`
     # verbatim instead of being re-parsed (or executed) by a shell.
     #
     # giffile:: path of the source image
     # epsfile:: path of the EPS file to create
     #
     # Returns whatever Kernel#system returns for the command.
     def gif2eps(giffile,epsfile)
       argv = ["convert", giffile.to_s, epsfile.to_s]
       STDERR.print argv.join(" "), "\n"
       system(*argv)
     end
     16 
     17 ###########################################################################
     # One parsed HTML tag: a lower-cased tag name plus its attributes.
     #
     # Attribute names are lower-cased. An attribute written without a value
     # (e.g. `border`) is stored as +true+. Values may be double quoted,
     # single quoted or bare; quoted values may contain whitespace, and
     # whitespace around the `=` sign is accepted.
     class Tag
       # name[=value]; captures: name, "double quoted", 'single quoted', bare
       ATTRIBUTE = /([^\s=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S*)))?/

       # Lower-cased tag name; closing tags keep their leading slash ("/p").
       attr_reader :tagname

       # str:: the tag text, normally including the surrounding "<" and ">"
       def initialize(str)
         str = $1 if str =~ /<(.+)>/m
         name, rest = str.strip.split(/\s+/, 2)
         # "<br/>" names the same element as "<br>"; an empty tag yields "".
         @tagname = (name || "").downcase.sub(/(?<=.)\/\z/, "")
         @vals = {}
         rest.to_s.scan(ATTRIBUTE) do |key, dquoted, squoted, bare|
           value = dquoted || squoted
           # a bare value may still carry an unbalanced quote; strip it
           value = bare.sub(/\A"/, "").sub(/"\z/, "") if value.nil? && !bare.nil?
           @vals[key.downcase] = value.nil? ? true : value
         end
       end

       # Yields every attribute as (name, value). Without a block an
       # enumerator over the attributes is returned.
       def each
         return @vals.each unless block_given?
         @vals.each do |k,v|
           yield k,v
         end
       end

       # Returns the value of attribute +k+ (lower-case name), +true+ for a
       # value-less attribute, or nil when the attribute is absent.
       def switch(k)
         @vals[k]
       end
     end
     49 
     # Splits an HTML document into tokens, reading it one line at a time.
     #
     # #get returns, in document order:
     # * a Tag for every "<...>" construct (tags may span several lines),
     # * a String holding LaTeX for a character entity ("&amp;" etc.),
     # * a String of plain text (never crossing a line boundary),
     # * nil once the input is exhausted.
     # Text consisting only of whitespace is skipped.
     #
     # Each line is converted to EUC-JP with Kconv when Kconv is loaded.
     class TokenStream
       TAG_START = "<"
       TAG_END = ">"
       AMP_START = "&"
       AMP_END = ";"

       # LaTeX for the recognised entities. The replacements deliberately
       # avoid "$" and "~": Environment#flush escapes those characters in
       # every string it is given, which would turn "$>$" into literal
       # dollar signs. "\penalty10000\ " is what "~" expands to.
       AMP_REPLACE_TABLE = {
         '&amp;'   => '\\&',
         '&gt;'    => '\\(>\\)',
         '&lt;'    => '\\(<\\)',
         '&nbsp;'  => '\\penalty10000\\ ',
         '&quot;'  => '"',
       }

       # a well-formed entity reference such as "&amp;" or "&#38;"
       ENTITY = /&#?[A-Za-z0-9]+;/
       # characters that end a run of plain text
       DELIMITER = /[<&]/

       # file:: an open stream (anything answering #gets and #eof?, e.g. a
       #        File, STDIN or a StringIO) or the name of a file to open
       def initialize(file)
         @f = file.respond_to?(:gets) ? file : File.new(file)
         @buf = nil
         @bpos = 0
       end

       # Consumes input from the current position up to and including the
       # first occurrence of +endsym+, continuing over line boundaries, and
       # returns it with CR/LF replaced by spaces. If the input ends first,
       # everything that was read is returned.
       def read_until(endsym)
         endsym = endsym.chr if endsym.kind_of?(Integer)
         pieces = []
         loop do
           break if buffer_spent? && !fill
           stop = @buf.index(endsym, @bpos)
           if stop
             pieces << @buf[@bpos..stop]
             @bpos = stop + 1
             break
           end
           pieces << @buf[@bpos..-1]
           @bpos = @buf.size
         end
         pieces.join.tr("\r\n", "  ")
       end

       # Returns the next token (see the class comment), or nil at the end
       # of the input.
       def get
         loop do
           return nil if buffer_spent? && !fill
           lead = @buf[@bpos]
           if lead == TAG_START
             return Tag.new(read_until(TAG_END))
           elsif lead == AMP_START && @buf.index(ENTITY, @bpos) == @bpos
             return replace_amp(read_until(AMP_END))
           end
           # Plain text. A "&" that does not start an entity is ordinary
           # text, so the search for the next delimiter begins after it.
           from = @bpos
           search_from = lead == AMP_START ? from + 1 : from
           @bpos = @buf.index(DELIMITER, search_from) || @buf.size
           text = @buf[from...@bpos]
           next if text =~ /\A\s*\z/
           return text
         end
       end

       # True once the buffered line is used up and the underlying stream
       # has nothing more to offer.
       def eof?
         buffer_spent? && @f.eof?
       end

       # Returns the LaTeX replacement for entity +s+, or +s+ itself when
       # the entity is not in AMP_REPLACE_TABLE.
       def replace_amp(s)
         AMP_REPLACE_TABLE.fetch(s, s)
       end

       private

       # Nothing left to scan in the current line?
       def buffer_spent?
         @buf.nil? || @bpos >= @buf.size
       end

       # Loads the next line into the buffer; false when the stream is at
       # its end.
       def fill
         line = @f.gets
         @bpos = 0
         if line.nil?
           @buf = nil
           return false
         end
         @buf = defined?(Kconv) ? Kconv.toeuc(line) : line
         true
       end
     end
    142 
    143 
    # Writes the fixed LaTeX preamble to standard output: an empty line, the
    # \documentstyle declaration, and the \hr, \pre, \endpre and \gt
    # definitions used by the generated markup.
    def print_header
      preamble = [
        '',
        '\documentstyle[epsf]{jarticle}',
        '\def\hr{\par\hbox to \textwidth{\hrulefill}}',
        '\def\pre{\begin{quote}\def\baselinestretch{0.8}\tt\obeylines}',
        '\def\endpre{\end{quote}}',
        '\makeatletter',
        '\@ifundefined{gt}{\let\gt=\dg}{}',
        '\makeatother',
        ''
      ]
      print preamble.join("\n")
    end
    155 
    156 
    # A stack of environments used to resolve tag names.
    #
    # Each element must answer #action(tag); the innermost (most recently
    # pushed) environment is consulted first.
    class Environ_stack
      # envs:: the initial environments, outermost first
      def initialize(*envs)
        @stack = envs
      end

      # Looks +tag+ up from the innermost environment outwards and returns
      # the first non-nil answer, or nil when no environment knows the tag.
      # Names starting with "!" (comments, doctype) yield an empty action.
      def action(tag)
        return ["",nil] if tag =~ /^!/
        @stack.reverse_each do |env|
          found = env.action(tag)
          return found unless found.nil?
        end
        nil
      end

      # Removes and returns the innermost environment.
      def pop
        @stack.pop
      end

      # Makes +env+ the innermost environment.
      def push(env)
        @stack.push(env)
      end

      # The innermost environment, or nil when the stack is empty.
      def top
        @stack.last
      end

      # Pushes a shallow copy of the innermost environment. (Note: this is
      # not Object#dup; the stack itself is not copied.) On an empty stack
      # nothing is pushed, so no nil entry can end up being asked for an
      # action later.
      def dup
        current = top
        return @stack if current.nil?
        @stack.push(current.clone)
      end
    end
    188 
    189 
    # Output state for one nesting level of the document plus the handlers
    # for tags that need more than a fixed string.
    #
    # Environ_stack#dup pushes shallow copies (#clone) of an Environment, so
    # a copy starts with the same settings and shares the table arrays of
    # the environment it was copied from.
    #
    # Output goes to standard output unless a table is being collected (then
    # it is appended to the current cell) or output is silenced (<TITLE>).
    class Environment
      # LaTeX replacement for each character that is special in running
      # text. \verb is avoided because it is not allowed inside the argument
      # of another command such as \section{...}.
      LATEX_ESCAPES = {
        '&' => '\&',
        '%' => '\%',
        '#' => '\#',
        '$' => '\$',
        '_' => '\_',
        '^' => '\^{}',
        '~' => '\~{}',
      }.freeze

      # A special character not already preceded by a backslash, so tokens
      # that arrive pre-escaped (such as "\&") are left alone.
      LATEX_SPECIAL = /(?<!\\)[&%\#\$_^~]/

      # <P ALIGN=...> value => LaTeX environment
      PARAGRAPH_ENVIRONMENTS = {
        "left"   => "flushleft",
        "center" => "center",
        "right"  => "flushright",
      }.freeze

      # interp:: Hash mapping tag names to actions
      def initialize(interp)
        @silent = false
        @in_table = false
        @interp = interp
        @align = nil
        @outer_cell = nil
      end

      # Returns the action registered for +tag+, or nil.
      def action(tag)
        @interp[tag]
      end

      # Emits +tok+. Strings get their LaTeX special characters escaped;
      # inside an aligned paragraph a trailing newline becomes a LaTeX line
      # break. Non-strings are emitted via #to_s.
      def flush(tok)
        tok = escape_latex(tok) if tok.kind_of?(String)
        tok = tok.to_s
        if !@in_table && !@align.nil? && tok =~ /\n$/
          tok = tok.chop + "\\\\\n"
        end
        emit tok
      end

      # Replaces the tag table used by #action.
      def set_interp(interp)
        @interp = interp
      end

      # tag processing methods

      # <TITLE>: suppress output
      def do_silent(tag)
        @silent = true
      end

      # </TITLE>: resume output
      def undo_silent(tag)
        @silent = false
      end

      # <IMG>: converts the image to EPS (via gif2eps) and includes it.
      # A ".gif" extension is replaced by ".eps"; for any other name ".eps"
      # is appended so the source file is never overwritten. An <IMG>
      # without SRC is reported on STDERR and skipped.
      def img_proc(tag)
        src = tag.switch('src')
        unless src.kind_of?(String) && !src.empty?
          STDERR.print "img tag without src ignored\n"
          return
        end
        newfile = src.sub(/\.gif\z/i, "") + ".eps"
        gif2eps(src,newfile)
        # emitted unescaped: "_" etc. in the file name must stay as they are
        emit "\\epsfile{file=#{newfile}}\n"
      end

      # <TABLE>: start collecting cells. A table opened inside a cell of
      # another table is later written into that cell.
      def starttable(tag)
        @outer_cell = @in_table ? current_cell : nil
        @table = []
        @tablespan = []
        @table_rows = -1
        @table_cols = -1
        @table_cols_max = 0
        @in_table = true
        @table_border = !tag.switch('border').nil?
      end

      # <TR>
      def start_row(tag)
        open_row
      end

      # <TD>: COLSPAN values that are missing, non-numeric or below 1 count
      # as 1.
      def start_col(tag)
        span = tag.switch('colspan').to_i
        span = 1 if span < 1
        open_cell(span)
      end

      # </TABLE>: writes the collected cells as a tabular environment.
      def endtable(tag)
        @in_table = false
        columns = @table_cols_max + 1
        colspec = @table_border ? "{|l}|" : "{l}"
        emit "\\begin{tabular}{*{#{columns}}#{colspec}}\n"
        emit "\\hline\n" if @table_border
        (0..@table_rows).each do |i|
          emit render_row(i, columns) + "\\\\\n"
          emit "\\hline\n" if @table_border
        end
        emit "\\end{tabular}\n"
        @outer_cell = nil
      end

      # <CENTER>
      def startcenter(tag)
        if @in_table
          # trailing space keeps following text from joining the command
          emit "\\hfil "
        else
          emit "\\begin{center}\n"
        end
      end

      # </CENTER>
      def endcenter(tag)
        if @in_table
          emit "\\hfil "
        else
          emit "\\end{center}\n"
        end
      end

      # <P>: ALIGN=left/center/right opens the matching LaTeX environment;
      # anything else (no ALIGN, no value, unknown value) is a plain \par.
      def paragraph(tag)
        align = tag.switch('align')
        align = align.kind_of?(String) ? align.downcase : nil
        env = PARAGRAPH_ENVIRONMENTS[align]
        if env.nil?
          emit "\\par\n"
          @endparagraph = ""
          @align = nil
        else
          emit "\\begin{#{env}}\n"
          @endparagraph = "\\end{#{env}}\n"
          @align = align
        end
      end

      # </P>: closes the environment opened by an aligned <P>
      def endparagraph(tag)
        return if @align.nil?
        @align = nil
        emit @endparagraph
      end

      private

      # Escapes LaTeX special characters. The block form is required: in a
      # replacement *string* "\&" means "the matched text".
      def escape_latex(text)
        text.gsub(LATEX_SPECIAL) { |ch| LATEX_ESCAPES[ch] }
      end

      # Sends ready-made LaTeX to the current destination, unescaped.
      def emit(latex)
        latex = latex.to_s
        if @in_table
          current_cell << latex
        elsif !@outer_cell.nil?
          @outer_cell << latex
        elsif !@silent
          print latex
        end
      end

      # Starts a new table row.
      def open_row
        @table_rows += 1
        @table[@table_rows] = []
        @tablespan[@table_rows] = []
        @table_cols = -1
      end

      # Starts a cell spanning +span+ columns. The span is recorded at the
      # cell's first column, its text at its last column.
      def open_cell(span)
        open_row if @table_rows < 0
        @tablespan[@table_rows][@table_cols + 1] = span
        @table_cols += span
        @table[@table_rows][@table_cols] = "".dup
        @table_cols_max = @table_cols if @table_cols > @table_cols_max
      end

      # The text buffer of the cell being filled; output that arrives before
      # any <TR>/<TD> opens a cell implicitly.
      def current_cell
        open_cell(1) if @table_rows < 0 || @table_cols < 0
        @table[@table_rows][@table_cols]
      end

      # LaTeX for row +i+, padded with empty cells up to +columns+.
      def render_row(i, columns)
        cells = []
        j = 0
        while j < columns
          span = @tablespan[i][j] || 1
          body = @table[i][j + span - 1].to_s
          if span == 1
            cells << body
          else
            form = "l"
            if @table_border
              form = j + span >= columns ? "|l|" : "|l"
            end
            cells << "\\multicolumn{#{span}}{#{form}}{#{body}}"
          end
          j += span
        end
        cells.join("&")
      end
    end
    373 
    374 
    # Tag tables.
    #
    # Every entry maps a lower-case tag name to [output, scope, in_body]:
    #   output  - a String of LaTeX handed to Environment#flush, or a Symbol
    #             naming the Environment method to call with the Tag;
    #   scope   - a Hash (another tag table) to enter a nested environment
    #             that uses it, "pop" to leave the current one, or nil;
    #   in_body - true when the tag belongs to the document body, so that
    #             \begin{document} is written first if it has not been yet.
    #
    # Output strings pass through Environment#flush, which escapes the
    # characters & % # $ _ ^ ~. They therefore use \( \) and \sb / \sp
    # rather than $...$, "_" and "^".
    enum_interp = {
      'li' => ["\\item ",nil]
    }

    item_interp = {
      'li' => ["\\item ",nil]
    }

    desc_interp = {
      'dt' => ["\\item[",nil],
      'dd' => ["]\n",nil]
    }

    table_interp = {
      'tr' => [:start_row,nil],
      'td' => [:start_col,nil],
      'th' => [:start_col,nil],
      '/tr' => ["",nil],
      '/td' => ["",nil],
      '/th' => ["",nil],
    }

    para_interp = {
      '/p'      => [:endparagraph ,"pop",true],
    }

    main_interp = {
      'body'    => ["\\begin{document}\n",nil,false],
      '/body'   => ["\\end{document}\n",nil,false],
      'head'    => ["",nil,false],
      '/head'   => ["",nil,false],
      'html'    => ["",nil,false],
      '/html'   => ["",nil,false],
      'title'   => [:do_silent,nil,false],
      '/title'  => [:undo_silent,nil,false],
      '!'       => ["",nil,false],
      'h1'      => ["\\section{",nil,true],
      'h2'      => ["\\subsection{",nil,true],
      'h3'      => ["\\subsubsection{",nil,true],
      'h4'      => ["\\paragraph{",nil,true],
      '/h1'     => ["}\n",nil,true],
      '/h2'     => ["}\n",nil,true],
      '/h3'     => ["}\n",nil,true],
      '/h4'     => ["}\n",nil,true],
      'a'       => ["",nil,true],
      '/a'      => ["",nil,true],
      'center'  => [:startcenter,nil,true],
      '/center' => [:endcenter,nil,true],
      'ol'      => ["\\begin{enumerate}\n",enum_interp,true],
      '/ol'     => ["\\end{enumerate}\n","pop",true],
      'ul'      => ["\\begin{itemize}\n",item_interp,true],
      '/ul'     => ["\\end{itemize}\n","pop",true],
      'dl'      => ["\\begin{description}\n",desc_interp,true],
      '/dl'     => ["\\end{description}\n","pop",true],
      'pre'     => ["\\begin{pre}\n",nil,true],
      '/pre'    => ["\\end{pre}\n",nil,true],
      'p'       => [:paragraph ,para_interp,true],
      'br'      => ["\\par ",nil,true],
      'img'     => [:img_proc,nil,true],
      'hr'      => ["\\hr ",nil,true],
      'b'       => ["{\\bf\\gt ",nil,true],
      '/b'      => ["}",nil,true],
      'strong'  => ["{\\bf\\gt ",nil,true],
      '/strong' => ["}",nil,true],
      'dfn'     => ["{\\bf\\gt ",nil,true],
      '/dfn'    => ["}",nil,true],
      # the trailing space ends the control word; "{\it" followed directly
      # by text would read as one undefined command
      'i'       => ["{\\it ",nil,true],
      '/i'      => ["}",nil,true],
      'address' => ["{\\it ",nil,true],
      '/address'=> ["}",nil,true],
      'cite'    => ["{\\it ",nil,true],
      '/cite'   => ["}",nil,true],
      'code'    => ["{\\tt ",nil,true],
      '/code'   => ["}",nil,true],
      'kbd'     => ["{\\tt ",nil,true],
      '/kbd'    => ["}",nil,true],
      'tt'      => ["{\\tt ",nil,true],
      '/tt'     => ["}",nil,true],
      'samp'    => ["{\\tt ",nil,true],
      '/samp'   => ["}",nil,true],
      'em'      => ["{\\em ",nil,true],
      '/em'     => ["}",nil,true],
      'u'       => ["\\underline{",nil,true],
      '/u'      => ["}",nil,true],
      'sub'     => ["\\({}\\sb{\\mbox{",nil,true],
      '/sub'    => ["}}\\)",nil,true],
      'sup'     => ["\\({}\\sp{\\mbox{",nil,true],
      '/sup'    => ["}}\\)",nil,true],
      'table'   => [:starttable, table_interp,true],
      '/table'  => [:endtable, "pop",true],
      'font'    => ["",nil,true],
      '/font'   => ["",nil,true],
    }




    ################################ MAIN ####################################

    if ARGV.empty?
      STDERR.print "usage: html2latex file.html\n"
      exit 1
    end

    $in_document = false
    print_header
    intp = Environ_stack.new(Environment.new(main_interp))
    f = TokenStream.new(ARGV[0])
    # number of nested environments currently pushed; a stray closing tag
    # must never pop the base environment
    depth = 0
    until f.eof?
      tok = f.get
      if tok.kind_of?(Tag)
        name = tok.tagname
        case name
        when "body"
          # a body tag seen earlier may already have opened the document
          next if $in_document
          $in_document = true
        when "/body"
          next unless $in_document
          $in_document = false
        end
        act = intp.action(name)
        if act.nil?
          STDERR.print "tag ",name," ignored\n"
          next
        end
        if act[2] && !$in_document
          print "\\begin{document}\n"
          $in_document = true
        end
        # environment push (a <P> only opens one when it has ALIGN)
        if act[1].kind_of?(Hash) &&
           (name != "p" || !tok.switch('align').nil?)
          intp.dup
          intp.top.set_interp(act[1])
          depth += 1
        end

        case act[0]
        when String
          intp.top.flush act[0]
        when Symbol
          intp.top.send(act[0],tok)
        end

        # environment pop
        if act[1] == "pop" && depth > 0
          intp.pop
          depth -= 1
        end
      elsif !tok.nil?
        intp.top.flush tok
      end
    end
    if $in_document
      print "\\end{document}\n"
    end
    517 end