#!/usr/bin/env ruby

# For every font encoding listed in `encodings`, reads the encoding vector
# data/enc/<enc>.enc and writes three derived files:
#   data/map/<enc>.map          slot <-> Unicode mapping
#   data/enc/<enc>-new.enc      glyph names annotated with slot (and code point)
#   data/enc2unicode/<enc>.dat  slot, code point, lowercase flag, glyph name
#
# Glyph names are resolved to code points through the Adobe glyph list
# (data/aglfn13.txt) plus the manual additions below; the lowercase flag is
# derived from data/UnicodeData.txt.

encodings = ["ec", "qx", "latin7x", "t8m", "lth"]
# "texnansi", "t5", "lt"

$path_data = "data"
$filename_AGL = File.join($path_data, "aglfn13.txt")

$filename_unicode_data = File.join($path_data, "UnicodeData.txt")

# glyph name => code point as a hex string; "?" marks a glyph that is known
# but deliberately has no Unicode mapping
$AGL_names = Hash.new()

# read from adobe glyph list
# (File.foreach closes the file once the enumeration is finished; the earlier
# File.open(...).grep form left the handle open)
File.foreach($filename_AGL).grep(/^[0-9A-F]+/) do |line|
  unicode, pdfname = line.split(/;/)
  $AGL_names[pdfname] = unicode
end

# code point (hex string) => true when it should be flagged as lowercase
$lowercase_letter = Hash.new()
# 00F0;LATIN SMALL LETTER ETH;Ll;0;L;;;;;N;;Icelandic;00D0;;00D0
# FB01;LATIN SMALL LIGATURE FI;Ll;0;L;<compat> 0066 0069;;;;N;;;;;
# lowercase letters
File.foreach($filename_unicode_data).grep(/^([0-9A-F]*);.*$/) do |line|
  unicode, _name, category, _dummy1, _dummy2, compat = line.split(/;/)
  code = unicode.hex
  if category == "Ll"
    # skip compatibility decompositions such as the fi ligature;
    # to_s guards against a line that has fewer than six fields
    $lowercase_letter[unicode] = true unless compat.to_s.include?("compat")
  # Thai
  elsif code >= 0x0E01 && code <= 0x0E5B
    $lowercase_letter[unicode] = true if category =~ /(Lo|Mn)/
  # Georgian lowercase (lowercase: 'Lo')
  elsif code >= 0x10D0 && code <= 0x10FA
    $lowercase_letter[unicode] = true
  end
end

# ij
$lowercase_letter["0133"] = true
# florin
$lowercase_letter["0192"] = false
# ell
$lowercase_letter["2113"] = false

$AGL_names["hyphenchar"] = $AGL_names["hyphen"]
$AGL_names["sfthyphen"] = "00AD"
$AGL_names["hyphen.alt"] = "00AD"

$AGL_names["dotlessj"] = "0237"
$AGL_names["tcedilla"] = "0163"
$AGL_names["Tcedilla"] = "0162"

$AGL_names["ff"] = "FB00" # = 0066 + 0066
$AGL_names["fi"] = "FB01" # = 0066 + 0069
$AGL_names["fl"] = "FB02" # = 0066 + 006C
$AGL_names["ffi"] = "FB03" # = 0066 + 0066 + 0069
$AGL_names["ffl"] = "FB04" # = 0066 + 0066 + 006C

$AGL_names["cwm"] = "200B"
$AGL_names["zerowidthspace"] = "200B"
$AGL_names["perthousandzero"] = "?"
$AGL_names["visiblespace"] = "2423"
#$AGL_names["nbspace"] = "00A0"
$AGL_names["nonbreakingspace"] = "00A0"
$AGL_names["Germandbls"] = "1E9E" # = 0053 + 0053
$AGL_names["ell"] = "2113"

$AGL_names[".notdef"] = "?"

$AGL_names["onesuperior"] = "00B9"
$AGL_names["twosuperior"] = "00B2"
$AGL_names["threesuperior"] = "00B3"

$AGL_names["anglearc"] = "2222"
$AGL_names["diameter"] = "2300"
$AGL_names["dottedcircle"] = "25CC"
$AGL_names["threequartersemdash"] = "?"
$AGL_names["f_k"] = "?"

encodings.each do |enc|
  puts "Writing files for encoding '#{enc}'"

  filename_encoding = File.join($path_data, "enc/#{enc}.enc")
  filename_xetex_mapping = File.join($path_data, "map/#{enc}.map")
  # FIXME (carried over from the original; presumably about this "-new" file -- confirm)
  filename_fixed_enc = File.join($path_data, "enc/#{enc}-new.enc")
  filename_encoding2unicode = File.join($path_data, "enc2unicode/#{enc}.dat")

  outputs = [filename_xetex_mapping, filename_fixed_enc, filename_encoding2unicode].map do |name|
    File.open(name, "w")
  end
  file_map, file_fixed_enc, file_encoding2unicode = outputs

  begin
    file_map.print("EncodingName \"TeX-#{enc}\"\n\n")
    file_map.print("pass(Byte_Unicode)\n\n")

    # slot number of the current glyph; runs across all lines of the vector
    i = 0
    # read the encoding vector: only lines containing a /glyphname are relevant
    File.foreach(filename_encoding).grep(/\/[_a-zA-Z0-9\.]+/) do |line|
      # ignore comments
      line.gsub!(/%.*/, '')
      # encoding name should not be considered
      line.gsub!(/.*\[/, '')
      # nor the ending definition
      line.gsub!(/\].*/, '')

      line.scan(/[_a-zA-Z0-9\.]+/) do |w|
        # Adobe Glyph List doesn't contain uniXXXX names,
        # so we add that particular uniXXXX to our list for easier handling later on
        $AGL_names[w] = $1 if w =~ /^uni(.*)$/

        unicode_point = $AGL_names[w]
        if unicode_point.nil?
          # the glyph is not in AGL and isn't uniXXXX: warn, but still count the slot
          puts sprintf(">> error: %s unknown (index 0x%02X)", w, i)
        elsif unicode_point == "?"
          file_map.printf("; %-20s: no Unicode mapping assigned\n", w)
          file_fixed_enc.printf("/%-15s %% 0x%02X\n", w, i)
          file_encoding2unicode.printf("0x%02X\tU+....\t\t%s\n", i, w)
        # somewhat unreliable way to filter out uniXXXX.something
        elsif unicode_point.size > 4
          file_map.printf("; %-20s: no unique way to map to Unicode\n", w)
          file_fixed_enc.printf("/%-15s %% 0x%02X U+%s\n", w, i, unicode_point)
          file_encoding2unicode.printf("0x%02X\tU+....\t\t%s\n", i, w)
        else
          code = unicode_point.hex
          file_map.printf("%d\t<>\tU+%s\t; %s\n", i, unicode_point, w)
          # the code point is only spelled out when it differs from the slot number
          if i != code
            file_fixed_enc.printf("/%-15s %% 0x%02X U+%s\n", w, i, unicode_point)
          else
            file_fixed_enc.printf("/%-15s %% 0x%02X\n", w, i)
          end

          is_lowercase = $lowercase_letter[unicode_point] && code > 127
          # exception: in Thai, we don't want any character below 0xA0
          is_lowercase = false if enc == "lth" && i < 0xA0
          lowercase = is_lowercase ? "1" : ""
          file_encoding2unicode.printf("0x%02X\tU+%s\t%s\t%s\n", i, unicode_point, lowercase, w)
        end
        i += 1
      end
    end
  ensure
    # close all three outputs, including the "-new.enc" file that was
    # previously left open, even when processing raises
    outputs.each(&:close)
  end
end