#!/usr/bin/env ruby

# Generates the conv-utf8-<encoding>.tex files: TeX macros that translate
# UTF-8 byte sequences into the single bytes of an 8-bit font encoding.
# The files are written into $output_data_dir, one per encoding.

require_relative 'hyph-utf8'

$path_root=File.expand_path("../../../..", __FILE__)
$encoding_data_dir = File.expand_path("../data/encodings", __FILE__)
$output_data_dir = "#{$path_root}/tex/generic/hyph-utf8/conversions"

# Returns the path of the converter file generated for +encoding+,
# i.e. <$output_data_dir>/conv-utf8-<encoding>.tex.
def output_file_name(encoding)
  File.join($output_data_dir, sprintf('conv-utf8-%s.tex', encoding))
end

# printf template for the banner at the top of every generated file.
# Arguments: encoding name, upper-cased encoding name, current year.
# Every literal percent sign is doubled because the text goes through printf.
$header = <<__EOHEADER__
%% conv-utf8-%s.tex
%%
%% Conversion from UTF-8 to %s,
%% used before loading hyphenation patterns for 8-bit TeX engines.
%%
%% This file is part of hyph-utf8 package and autogenerated.
%% See http://tug.org/tex-hyphen
%%
%% Copyright 2008-%d TeX Users Group.
%% You may freely use, modify and/or distribute this file.
%% (But consider adapting the scripts if you need modifications.)
__EOHEADER__

# Writes the banner for +encoding+ to +outfile+ (any object responding to
# printf), filling in the encoding name and the current year.
def output_copyright_notice(outfile, encoding)
  outfile.printf $header, encoding, encoding.upcase, Time.new.year
end

# TeX macro that looks up the replacement stored for a Unicode number and
# reports an error when none was registered.
$uniconvmacro1 = <<__EOUNIMAC1__
% macros adapted from ConTeXt MKII; see unic-ini.mkii
\\def\\unicodechar#1{%
  \\ifcsname unichar@\\number#1\\endcsname
    \\csname unichar@\\number#1\\endcsname
  \\else
    \\errmessage{Unicode character [#1] not in encoding.}%
  \\fi}
__EOUNIMAC1__

# TeX macros turning a UTF-8 byte sequence into its Unicode number, indexed
# by sequence length in bytes. The two leading nils pad the array so that
# the 2-, 3- and 4-byte macros sit at indices 2, 3 and 4.
$uniconvmacros = [nil, nil]

$uniconvmacros << <<__EOTWOBYTES__
\\def\\utftwouniglyph#1#2%
  {\\expandafter\\unicodechar\\expandafter
    {\\the\\numexpr64*(#1-192)+`#2-128\\relax}}
__EOTWOBYTES__

$uniconvmacros << <<__EOTHREEBYTES__
\\def\\utfthreeuniglyph#1#2#3%
  {\\expandafter\\unicodechar\\expandafter
    {\\the\\numexpr4096*(#1-224)+64*(`#2-128)+`#3-128\\relax}}
__EOTHREEBYTES__

$uniconvmacros << <<__EOFOURBYTES__
\\def\\utffouruniglyph#1#2#3#4%
  {\\expandafter\\unicodechar\\expandafter
    {\\the\\numexpr262144*(#1-240)+4096*(`#2-128)+64*(`#3-128)+`#4-128\\relax}}
__EOFOURBYTES__

# TeX macro that registers one "Unicode number -> font encoding byte" pair.
$uniconvmacro2 = <<__EOUNIMAC2__

\\def\\addunichar #1 #2 {\\expandafter\\def\\csname unichar@\\number#1\\endcsname{#2}}

% \\addunichar "unicode_code - ^^font_encoding_code
__EOUNIMAC2__

# Encodings whose characters need UTF-8 sequences of different lengths:
# every character is registered by Unicode number and each possible first
# byte becomes an active character that decodes the rest of the sequence.
["t8m", "lth"].each do |encoding|
  e = HyphEncoding.new(encoding)

  File.open(output_file_name(encoding), "w") do |file_out|
    output_copyright_notice(file_out, encoding)
    file_out.puts

    # lookup macro (with its error message for unknown characters)
    file_out.puts $uniconvmacro1

    # Shortest and longest UTF-8 sequence in the encoding, taken from the
    # first and last entry of the sorted character list.
    # NOTE(review): assumes the keys sort in code point order - confirm
    # against HyphEncoding#unicode_characters.
    unicode_characters_array = e.unicode_characters.sort
    length_min = unicode_characters_array.first[1].bytes.size
    length_max = unicode_characters_array.last[1].bytes.size

    # only the decoding macros for sequence lengths that actually occur
    (length_min..length_max).each do |nbytes|
      file_out.puts $uniconvmacros[nbytes]
    end

    # registration macro
    file_out.puts $uniconvmacro2

    # one \addunichar (plus \lccode) line per character in the encoding
    unicode_characters_array.each do |_code, c|
      file_out.printf("\\addunichar \"%04X ^^%02x \\lccode\"%02X=\"%02X %% %s - %s\n",
                      c.code_uni, c.code_enc, c.code_enc, c.code_enc, [c.code_uni].pack('U'), c.name)
    end
    file_out.puts

    # make every possible first byte active and let it dispatch to the
    # decoding macro that matches the sequence length
    e.unicode_characters_first_byte.sort.each do |first_byte_code, chars|
      byte = first_byte_code.hex
      size = chars[0].bytes.size
      str = case size
            when 2 then "two"   # 2-byte: 0b11000000 <= byte < 0b11100000
            when 3 then "three" # 3-byte: 0b11100000 <= byte < 0b11110000
            when 4 then "four"  # 4-byte: 0b11110000 <= byte < 0b11111000
            else
              # Fail loudly instead of emitting a nameless \utfuniglyph macro.
              raise "The encoding #{encoding} uses #{size}-byte UTF-8 sequences, which are not supported"
            end
      file_out.printf("\\catcode\"%02X=\\active \\def^^%02x{\\utf%suniglyph{\"%02X}}\n", byte, byte, str, byte)
    end
  end
end

# Encodings whose characters all fit into two UTF-8 bytes: each first byte
# becomes an active character that compares the following byte against
# every known second byte with a chain of \ifx tests.
["ec", "qx", "t2a", "lmc", "il2", "il3", "l7x"].each do |encoding|
  e = HyphEncoding.new(encoding)

  File.open(output_file_name(encoding), "w") do |file_out|
    output_copyright_notice(file_out, encoding)
    file_out.puts '%'

    # make all the possible first bytes active
    e.unicode_characters_first_byte.sort.each do |first_byte_code, _chars|
      file_out.printf("\\catcode\"%02X=\\active\n", first_byte_code.hex)
    end
    file_out.puts "%"

    e.unicode_characters_first_byte.sort.each do |first_byte_code, chars|
      # Order the second bytes by code point. Sorting in the pass that uses
      # the order means it does not depend on the accessor returning the
      # same arrays on every call.
      chars.sort! { |x, y| x.code_uni <=> y.code_uni }

      first_byte = first_byte_code.hex
      unless chars[0].bytes.size == 2
        # raise, not throw: throw without a matching catch only produces a
        # confusing UncaughtThrowError.
        raise "The encoding #{encoding} uses more than two bytes to encode characters"
      end

      file_out.printf("\\def^^%02x#1{%%\n", first_byte)
      chars.each do |uni_character|
        file_out.printf("\t\\ifx#1^^%02x^^%02x\\else %% %s - U+%04X - %s\n", uni_character.bytes[1], uni_character.code_enc, [uni_character.code_uni].pack('U'), uni_character.code_uni, uni_character.name)
      end
      file_out.puts "\t\\errmessage{Hyphenation pattern file corrupted or #{encoding} encoding not supported!}"
      # one \fi to close each \ifx opened above
      file_out.puts ("\\fi" * chars.size) + "}"
    end

    file_out.puts '%'
    file_out.puts '% ensure all the chars above have valid \lccode values'
    file_out.puts '%'
    e.lowercase_characters.each do |character|
      # two upper-case hex digits of the encoding byte, e.g. \lccode"FF="FF
      code = [character.code_enc].pack("c").unpack("H2").first.upcase
      file_out.printf "\\lccode\"%s=\"%s %% %s - U+%04X - %s\n", code, code, [character.code_uni].pack('U'), character.code_uni, character.name
    end

    file_out.puts
  end
end