html2latex (10574B)
#!/usr/local/bin/ruby

#
# HTML to LaTeX converter
# by A. Ito, 16 June, 1997
#
# Usage: html2latex file.html > file.tex
#
# Each input line is converted to EUC-JP with Kconv, split by TokenStream
# into tags, entities and text runs, and translated through the per-tag
# tables (the *_INTERP hashes near the end of this file) into LaTeX that is
# written to standard output.  Diagnostics go to standard error.
#

require 'kconv'

# configuration

# Convert an image file to EPS by running ImageMagick's `convert`.
#
# giffile:: path of the source image (taken from an <IMG SRC=...> attribute)
# epsfile:: path of the EPS file to create
#
# The command line is echoed on STDERR.  Returns what Kernel#system returns
# (true on success, false/nil otherwise).  The arguments are handed to
# `system` as separate words so that no shell ever interprets a file name
# coming from the HTML document.
def gif2eps(giffile, epsfile)
  cmd = ['convert', giffile, epsfile]
  STDERR.print cmd.join(' '), "\n"
  ok = system(*cmd)
  STDERR.print 'convert failed: ', giffile, "\n" unless ok
  ok
end

###########################################################################

# A String that already is finished LaTeX markup.  Environment#flush passes
# it through unchanged instead of escaping it like document text.
class RawLatex < String
end

# One HTML tag: the lower-cased tag name plus its attributes.
class Tag
  # name, optionally followed by ="double", ='single' or =bare value
  ATTR_PATTERN = /([^\s=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/

  # str:: the tag text, with or without the surrounding "<" and ">".
  #
  # Attribute names are lower-cased.  An attribute without a value is stored
  # as +true+.  A blank tag yields the empty tag name instead of raising.
  def initialize(str)
    inner = str[/<(.+)>/, 1] || str
    name, rest = inner.strip.split(/\s+/, 2)
    name = (name || '').downcase
    # "<br/>" should be looked up as "br"; a lone "/" is left alone.
    name = name.chomp('/') if name.size > 1
    @tagname = name
    @vals = {}
    # The body of a comment or declaration is not an attribute list.
    return if rest.nil? || name.start_with?('!')

    rest.scan(ATTR_PATTERN) do |key, dquoted, squoted, bare|
      @vals[key.downcase] = dquoted || squoted || bare || true
    end
  end

  # Lower-cased tag name; closing tags keep their leading "/".
  def tagname
    @tagname
  end

  # Yields every attribute as (name, value).
  def each
    @vals.each do |k, v|
      yield k, v
    end
  end

  # Value of attribute +k+ (lower case): a String, +true+ for a value-less
  # attribute, or +nil+ when the attribute is absent.
  def switch(k)
    @vals[k]
  end
end

# Splits an HTML source into Tag objects, entity replacements and text runs.
class TokenStream
  TAG_START = '<'
  TAG_END = '>'
  AMP_START = '&'
  AMP_END = ';'
  COMMENT_START = '<!--'
  COMMENT_END = '-->'

  # A complete character reference starting exactly at the scan position.
  ENTITY = /\G&#?[A-Za-z0-9]+;/

  # LaTeX replacement for the entities this converter knows about.
  AMP_REPLACE_TABLE = {
    '&amp;'  => '\\&',
    '&gt;'   => '$>$',
    '&lt;'   => '$<$',
    '&nbsp;' => '~',
    '&quot;' => '"',
  }.freeze

  # file:: an object responding to +gets+ and +eof?+ (File, StringIO, ...),
  #        or the path of a file to open.
  def initialize(file)
    @f = file.respond_to?(:gets) ? file : File.new(file)
    @buf = nil
    @bpos = 0
  end

  # Collects input from the current position up to and including +endsym+,
  # continuing over line boundaries if necessary.  Line terminators inside
  # the result are replaced by blanks.  At end of input the partial text
  # read so far is returned.
  def read_until(endsym)
    pieces = []
    loop do
      stop = @buf.index(endsym, @bpos)
      if stop
        stop += endsym.size
        pieces << @buf[@bpos...stop]
        @bpos = stop
        break
      end
      pieces << @buf[@bpos..-1]
      break unless fill_buffer
    end
    pieces.join.tr("\r\n", '  ')
  end

  # Returns the next token, or +nil+ at end of input:
  # * a Tag for "<...>" (a comment is read up to its closing "-->"),
  # * a RawLatex for a known entity,
  # * a String for anything else.  Text runs consisting only of white space
  #   are skipped.
  def get
    loop do
      if @buf.nil? || @bpos >= @buf.size
        return nil unless fill_buffer
      end

      case @buf[@bpos, 1]
      when TAG_START
        closer = @buf[@bpos, COMMENT_START.size] == COMMENT_START ? COMMENT_END : TAG_END
        return Tag.new(read_until(closer))
      when AMP_START
        m = ENTITY.match(@buf, @bpos)
        if m
          @bpos += m[0].size
          return replace_amp(m[0])
        end
        # A bare "&" is ordinary text; hand it out on its own so the text
        # run below never starts on it.
        @bpos += 1
        return @buf[@bpos - 1, 1]
      else
        stop = @buf.index(/[<&]/, @bpos) || @buf.size
        text = @buf[@bpos, stop - @bpos]
        @bpos = stop
        next if text =~ /\A\s+\z/
        return text
      end
    end
  end

  # True when the buffered line is used up and the source has no more data.
  def eof?
    (@buf.nil? || @bpos >= @buf.size) && @f.eof?
  end

  # Maps an entity such as "&amp;" to its LaTeX form (as RawLatex).  Unknown
  # entities are returned unchanged as plain text.
  def replace_amp(s)
    AMP_REPLACE_TABLE.key?(s) ? RawLatex.new(AMP_REPLACE_TABLE[s]) : s
  end

  private

  # Reads the next line into the buffer, converted to EUC-JP.  Returns false
  # (leaving the buffer empty) at end of input.
  def fill_buffer
    line = @f.gets
    @buf = line && Kconv.toeuc(line)
    @bpos = 0
    !@buf.nil?
  end
end


# Prints the LaTeX preamble on standard output.
def print_header
  print '
\documentstyle[epsf]{jarticle}
\def\hr{\par\hbox to \textwidth{\hrulefill}}
\def\pre{\begin{quote}\def\baselinestretch{0.8}\tt\obeylines}
\def\endpre{\end{quote}}
\makeatletter
\@ifundefined{gt}{\let\gt=\dg}{}
\makeatother
'
end


# Stack of Environment objects.  A tag is looked up from the innermost
# environment outwards.
class Environ_stack
  def initialize(*envs)
    @stack = envs
  end

  # Returns the interpretation entry for tag name +tag+, or +nil+ when no
  # environment on the stack knows the tag.  Comments and declarations
  # (names starting with "!") map to an entry that emits nothing.
  def action(tag)
    return ['', nil] if tag.start_with?('!') # comment

    @stack.reverse_each do |env|
      entry = env.action(tag)
      return entry unless entry.nil?
    end
    nil
  end

  # Drops the innermost environment.  The outermost one always stays, so a
  # stray closing tag cannot leave the stack empty.
  def pop
    @stack.pop if @stack.size > 1
  end

  def push(env)
    @stack.push(env)
  end

  # The innermost environment.
  def top
    @stack.last
  end

  # Pushes a shallow copy of the innermost environment.
  def dup
    @stack.push(top.clone)
  end
end


# Output state plus the tag handler methods named in the *_INTERP tables.
class Environment
  # Characters of document text that are special to LaTeX.
  LATEX_SPECIAL = /[\\&%\#\$_^~{}<>]/

  LATEX_ESCAPES = {
    '\\' => '$\\backslash$',
    '{'  => '\\{',
    '}'  => '\\}',
    '&'  => '\\&',
    '%'  => '\\%',
    '#'  => '\\#',
    '$'  => '\\$',
    '_'  => '\\_',
    '^'  => '\\^{}',
    '~'  => '\\~{}',
    '<'  => '$<$',
    '>'  => '$>$',
  }.freeze

  # <P ALIGN=...> value => LaTeX environment
  PARAGRAPH_ENVS = {
    'left'   => 'flushleft',
    'center' => 'center',
    'right'  => 'flushright',
  }.freeze

  # One table cell: the number of columns it spans and its LaTeX content.
  Cell = Struct.new(:span, :text)

  # interp:: Hash mapping a tag name to its interpretation entry.
  def initialize(interp)
    @silent = false
    @in_table = false
    @interp = interp
    @align = nil
    @endparagraph = ''
    @rows = nil
    @outer_cell = nil
    @table_border = false
  end

  # Interpretation entry of this environment for +tag+, or +nil+.
  def action(tag)
    @interp[tag]
  end

  # Outputs document text.  A plain String is escaped for LaTeX; inside an
  # aligned paragraph (outside tables) a trailing newline becomes a forced
  # line break.  RawLatex and non-String values are passed on as they are.
  def flush(tok)
    if tok.kind_of?(String) && !tok.kind_of?(RawLatex)
      tok = escape_latex(tok)
      tok = tok.chop + "\\\\\n" if @align && !@in_table && tok =~ /\n\z/
    end
    emit(tok)
  end

  # Outputs +markup+ without any escaping: appended to the current table
  # cell while a table is being collected (dropped when no cell is open),
  # discarded while silent, printed on standard output otherwise.
  def emit(markup)
    if @in_table
      cell = current_cell
      cell.text += markup.to_s if cell
    elsif !@silent
      print markup
    end
  end

  def set_interp(interp)
    @interp = interp
  end

  # tag processing methods

  # <TITLE>
  def do_silent(tag)
    @silent = true
  end

  # </TITLE>
  def undo_silent(tag)
    @silent = false
  end

  # <IMG>
  def img_proc(tag)
    src = tag.switch('src')
    unless src.kind_of?(String) && !src.empty?
      STDERR.print "img tag without src ignored\n"
      return
    end
    # Swap whatever extension there is for ".eps" so that the converted
    # file never has the same name as the source file.
    newfile = src.sub(/\.[^.\/]+\z/, '') + '.eps'
    gif2eps(src, newfile)
    emit "\\epsfile{file=#{newfile}}\n"
  end

  # <TABLE>
  def starttable(tag)
    # When this table starts inside a cell of another table, remember that
    # cell: the finished tabular goes there instead of straight to output.
    @outer_cell = @in_table ? current_cell : nil
    @rows = []
    @in_table = true
    border = tag.switch('border')
    @table_border = !border.nil? && border != '0'
  end

  # <TR>
  def start_row(tag)
    @rows << []
  end

  # <TD>, <TH>
  def start_col(tag)
    start_row(tag) if @rows.empty? # cell without a preceding <TR>
    colspan = tag.switch('colspan')
    span = colspan.kind_of?(String) ? colspan.to_i : 1
    span = 1 if span < 1
    @rows.last << Cell.new(span, '')
  end

  # </TABLE>
  def endtable(tag)
    return if @rows.nil? # </TABLE> without <TABLE>

    @in_table = false
    widths = @rows.map { |row| row.inject(0) { |sum, cell| sum + cell.span } }
    width = [widths.max || 0, 1].max

    out = "\\begin{tabular}{*{#{width}}"
    out += @table_border ? "{|l}|}\n\\hline\n" : "{l}}\n"
    @rows.each do |row|
      col = 0
      cells = row.map do |cell|
        col += cell.span
        if cell.span == 1
          cell.text
        else
          form = 'l'
          # only the cell reaching the last column draws the right rule
          form = col >= width ? '|l|' : '|l' if @table_border
          "\\multicolumn{#{cell.span}}{#{form}}{#{cell.text}}"
        end
      end
      out += cells.join('&') + "\\\\\n"
      out += "\\hline\n" if @table_border
    end
    out += "\\end{tabular}\n"

    if @outer_cell
      @outer_cell.text += out
    else
      emit out
    end
  end

  # <CENTER>
  def startcenter(tag)
    emit(@in_table ? '\\hfil ' : "\\begin{center}\n")
  end

  # </CENTER>
  def endcenter(tag)
    emit(@in_table ? '\\hfil ' : "\\end{center}\n")
  end

  # <P>
  def paragraph(tag)
    align = tag.switch('align')
    align = align.kind_of?(String) ? align.downcase : nil
    env = PARAGRAPH_ENVS[align]
    if env
      emit "\\begin{#{env}}\n"
      @endparagraph = "\\end{#{env}}\n"
      # set only after the \begin line so it does not affect that line
      @align = align
    else
      # no ALIGN, or a value we have no environment for
      emit "\\par\n"
      @endparagraph = ''
      @align = nil
    end
  end

  # </P>
  def endparagraph(tag)
    return if @align.nil?

    @align = nil
    emit @endparagraph
  end

  private

  # Escapes the characters of +text+ that LaTeX would interpret.
  def escape_latex(text)
    text.gsub(LATEX_SPECIAL, LATEX_ESCAPES)
  end

  # The cell that currently receives output, or nil when none is open.
  def current_cell
    row = @rows && @rows.last
    row && row.last
  end
end


# Interpretation tables: tag name => [action, scope, needs_document]
#   action         - String: LaTeX markup emitted as is;
#                    Symbol: Environment method called with the Tag
#   scope          - Hash: pushed as a new environment for the element;
#                    "pop": the innermost environment is dropped afterwards
#   needs_document - when true, \begin{document} is emitted first if the
#                    document body has not been opened yet

ENUM_INTERP = {
  'li' => ['\\item ', nil]
}.freeze

ITEM_INTERP = {
  'li' => ['\\item ', nil]
}.freeze

DESC_INTERP = {
  'dt' => ['\\item[', nil],
  'dd' => ["]\n", nil]
}.freeze

TABLE_INTERP = {
  'tr'  => [:start_row, nil],
  'td'  => [:start_col, nil],
  'th'  => [:start_col, nil],
  '/tr' => ['', nil],
  '/td' => ['', nil],
  '/th' => ['', nil],
}.freeze

PARA_INTERP = {
  '/p' => [:endparagraph, 'pop', true],
}.freeze

MAIN_INTERP = {
  'body'     => ["\\begin{document}\n", nil, false],
  '/body'    => ["\\end{document}\n", nil, false],
  'head'     => ['', nil, false],
  '/head'    => ['', nil, false],
  'html'     => ['', nil, false],
  '/html'    => ['', nil, false],
  'title'    => [:do_silent, nil, false],
  '/title'   => [:undo_silent, nil, false],
  '!'        => ['', nil, false],
  'h1'       => ['\\section{', nil, true],
  'h2'       => ['\\subsection{', nil, true],
  'h3'       => ['\\subsubsection{', nil, true],
  'h4'       => ['\\paragraph{', nil, true],
  '/h1'      => ["}\n", nil, true],
  '/h2'      => ["}\n", nil, true],
  '/h3'      => ["}\n", nil, true],
  '/h4'      => ["}\n", nil, true],
  'a'        => ['', nil, true],
  '/a'       => ['', nil, true],
  'center'   => [:startcenter, nil, true],
  '/center'  => [:endcenter, nil, true],
  'ol'       => ["\\begin{enumerate}\n", ENUM_INTERP, true],
  '/ol'      => ["\\end{enumerate}\n", 'pop', true],
  'ul'       => ["\\begin{itemize}\n", ITEM_INTERP, true],
  '/ul'      => ["\\end{itemize}\n", 'pop', true],
  'dl'       => ["\\begin{description}\n", DESC_INTERP, true],
  '/dl'      => ["\\end{description}\n", 'pop', true],
  'pre'      => ["\\begin{pre}\n", nil, true],
  '/pre'     => ["\\end{pre}\n", nil, true],
  'p'        => [:paragraph, PARA_INTERP, true],
  # closes a <P> that had no ALIGN (and therefore no environment of its own)
  '/p'       => ['', nil, true],
  'br'       => ['\\par ', nil, true],
  'img'      => [:img_proc, nil, true],
  'hr'       => ['\\hr ', nil, true],
  'b'        => ['{\\bf\\gt ', nil, true],
  '/b'       => ['}', nil, true],
  'strong'   => ['{\\bf\\gt ', nil, true],
  '/strong'  => ['}', nil, true],
  'dfn'      => ['{\\bf\\gt ', nil, true],
  '/dfn'     => ['}', nil, true],
  # the blank after the font command keeps it apart from the text following
  'i'        => ['{\\it ', nil, true],
  '/i'       => ['}', nil, true],
  'address'  => ['{\\it ', nil, true],
  '/address' => ['}', nil, true],
  'cite'     => ['{\\it ', nil, true],
  '/cite'    => ['}', nil, true],
  'code'     => ['{\\tt ', nil, true],
  '/code'    => ['}', nil, true],
  'kbd'      => ['{\\tt ', nil, true],
  '/kbd'     => ['}', nil, true],
  'tt'       => ['{\\tt ', nil, true],
  '/tt'      => ['}', nil, true],
  'samp'     => ['{\\tt ', nil, true],
  '/samp'    => ['}', nil, true],
  'em'       => ['{\\em ', nil, true],
  '/em'      => ['}', nil, true],
  'u'        => ['$\\underline{\\mbox{', nil, true],
  '/u'       => ['}}$', nil, true],
  'sub'      => ['${}_\\mbox{', nil, true],
  '/sub'     => ['}$', nil, true],
  'sup'      => ['${}^\\mbox{', nil, true],
  '/sup'     => ['}$', nil, true],
  'table'    => [:starttable, TABLE_INTERP, true],
  '/table'   => [:endtable, 'pop', true],
  'font'     => ['', nil, true],
  '/font'    => ['', nil, true],
}.freeze


################################ MAIN ####################################

# Translates every token of +stream+ (a TokenStream) and prints the LaTeX
# document, including the preamble, on standard output.
def convert_stream(stream)
  in_document = false
  print_header
  intp = Environ_stack.new(Environment.new(MAIN_INTERP))

  while (tok = stream.get)
    unless tok.kind_of?(Tag)
      intp.top.flush tok
      next
    end

    name = tok.tagname
    case name
    when 'body'
      in_document = true
    when '/body'
      in_document = false
    end

    act = intp.action(name)
    if act.nil?
      STDERR.print 'tag ', name, " ignored\n"
      next
    end
    markup, scope, needs_document = act

    if needs_document && !in_document
      print "\\begin{document}\n"
      in_document = true
    end

    # environment push (<P> gets its own environment only with ALIGN)
    if scope.kind_of?(Hash) && (name != 'p' || !tok.switch('align').nil?)
      intp.dup
      intp.top.set_interp(scope)
    end

    case markup
    when String
      intp.top.emit markup
    when Symbol
      intp.top.send(markup, tok)
    end

    # environment pop
    intp.pop if scope == 'pop'
  end

  print "\\end{document}\n" if in_document
end

# Converts one HTML document to LaTeX on standard output.
#
# source:: path of the HTML file, or an object responding to +gets+ and
#          +eof?+ such as an open File or a StringIO.
def html2latex(source)
  if source.respond_to?(:gets)
    convert_stream(TokenStream.new(source))
  else
    File.open(source) { |io| convert_stream(TokenStream.new(io)) }
  end
end

if __FILE__ == $PROGRAM_NAME
  if ARGV.empty?
    STDERR.print "usage: html2latex file.html\n"
  else
    html2latex(ARGV[0])
  end
end