• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env ruby
2
# Names of the encodings processed by this script; one set of output files
# is written per entry.
encodings = ["ec", "qx", "latin7x", "t8m", "lth"]
# Further encoding names that are currently not in the list:
# "texnansi", "t5", "lt"

# Directory holding both the input data and the generated files.
$path_data         = "data"
# Adobe glyph list: maps glyph names to Unicode code points.
$filename_AGL      = File.join($path_data, "aglfn13.txt")

# Unicode character database (semicolon-separated records).
$filename_unicode_data = File.join($path_data, "UnicodeData.txt")

# glyph name => Unicode code point as a hexadecimal string (e.g. "00AD")
$AGL_names = {}
12
13
# Read the Adobe glyph list. Only lines that start with a hexadecimal code
# point are data records; the first two ';'-separated fields are the code
# point and the glyph name, any further fields are ignored.
# File.foreach closes the file when iteration ends (a bare File.open(...).grep
# would leave the handle open).
File.foreach($filename_AGL) do |line|
	next unless line =~ /^[0-9A-F]+/

	unicode, pdfname = line.split(/;/)
	$AGL_names[pdfname] = unicode
end
19
# Unicode code point (hex string) => true for characters that should be
# flagged as lowercase letters in the generated .dat files.
$lowercase_letter = {}
# Sample records from UnicodeData.txt:
# 00F0;LATIN SMALL LETTER ETH;Ll;0;L;;;;;N;;Icelandic;00D0;;00D0
# FB01;LATIN SMALL LIGATURE FI;Ll;0;L;<compat> 0066 0069;;;;N;;;;;
#
# Fields used here: 1 = code point, 3 = general category, 6 = decomposition.
# File.foreach closes the file when iteration ends.
File.foreach($filename_unicode_data) do |line|
	next unless line =~ /^([0-9A-F]*);.*$/

	unicode, _name, category, _field4, _field5, decomposition = line.split(/;/)
	codepoint = unicode.hex
	if category == "Ll"
		# lowercase letters, except those with a <compat> decomposition
		# (such as the "fi" ligature in the sample above)
		$lowercase_letter[unicode] = true unless decomposition.include?("compat")
	elsif codepoint.between?(0x0E01, 0x0E5B)
		# Thai: only categories 'Lo' and 'Mn' are accepted
		$lowercase_letter[unicode] = true if category =~ /(Lo|Mn)/
	elsif codepoint.between?(0x10D0, 0x10FA)
		# Georgian lowercase (general category is 'Lo', not 'Ll')
		$lowercase_letter[unicode] = true
	end
end
41
42
# Manual corrections to the lowercase table built from UnicodeData.txt.
# ij
$lowercase_letter["0133"] = true
# florin
$lowercase_letter["0192"] = false
# ell
$lowercase_letter["2113"] = false

# Glyph names that are added to (or override) the entries read from the
# Adobe glyph list. The value "?" marks a glyph that has no Unicode code
# point assigned.
$AGL_names["hyphenchar"] = $AGL_names["hyphen"]
$AGL_names["sfthyphen"] = "00AD"
$AGL_names["hyphen.alt"] = "00AD"

$AGL_names["dotlessj"] = "0237"
$AGL_names["tcedilla"] = "0163"
$AGL_names["Tcedilla"] = "0162"

# ligatures
$AGL_names["ff"]  = "FB00" # = 0066 + 0066
$AGL_names["fi"]  = "FB01" # = 0066 + 0069
$AGL_names["fl"]  = "FB02" # = 0066 + 006C
$AGL_names["ffi"] = "FB03" # = 0066 + 0066 + 0069
$AGL_names["ffl"] = "FB04" # = 0066 + 0066 + 006C

$AGL_names["cwm"] = "200B"
$AGL_names["zerowidthspace"] = "200B"
$AGL_names["perthousandzero"] = "?"
$AGL_names["visiblespace"] = "2423"
#$AGL_names["nbspace"] = "00A0"
$AGL_names["nonbreakingspace"] = "00A0"
$AGL_names["Germandbls"] = "1E9E" # = 0053 + 0053
$AGL_names["ell"] = "2113"

$AGL_names[".notdef"] = "?"

$AGL_names["onesuperior"] = "00B9"
$AGL_names["twosuperior"] = "00B2"
$AGL_names["threesuperior"] = "00B3"

$AGL_names["anglearc"] = "2222"
$AGL_names["diameter"] = "2300"
$AGL_names["dottedcircle"] = "25CC"
$AGL_names["threequartersemdash"] = "?"
$AGL_names["f_k"] = "?"
84
# For every encoding, read data/enc/<enc>.enc and write three files:
#   map/<enc>.map          - one "index <> U+XXXX ; glyphname" line per slot
#   enc/<enc>-new.enc      - the glyph names annotated with slot and code point
#   enc2unicode/<enc>.dat  - tab-separated: slot, code point, lowercase flag
#                            ("1" or empty), glyph name
# Glyph names are numbered in the order in which they appear in the .enc file.
encodings.each do |enc|
	puts "Writing files for encoding '#{enc}'"

	$filename_encoding         = File.join($path_data, "enc/#{enc}.enc")
	$filename_xetex_mapping    = File.join($path_data, "map/#{enc}.map")
	$filename_encoding2unicode = File.join($path_data, "enc2unicode/#{enc}.dat")

	# Reset the handles so that the ensure clause below only closes files
	# opened during this iteration.
	$file_map = $file_fixed_enc = $file_encoding2unicode = nil
	begin
		$file_map = File.open($filename_xetex_mapping, "w")
		# FIXME (marker kept from the original code; it does not say what needs fixing)
		$file_fixed_enc = File.open(File.join($path_data, "enc/#{enc}-new.enc"), "w")
		$file_encoding2unicode = File.open($filename_encoding2unicode, "w")

		$file_map.print("EncodingName \"TeX-#{enc}\"\n\n")
		$file_map.print("pass(Byte_Unicode)\n\n")

		# index of the current glyph name within the encoding
		i = 0
		# only lines that contain at least one /glyphname are of interest;
		# File.foreach closes the encoding file when iteration ends
		File.foreach($filename_encoding) do |line|
			next unless line =~ /\/[_a-zA-Z0-9\.]+/

			# ignore comments
			line.gsub!(/%.*/, '')
			# encoding name should not be considered
			line.gsub!(/.*\[/, '')
			# nor the ending definition
			line.gsub!(/\].*/, '')

			line.scan(/[_a-zA-Z0-9\.]+/) do |w|
				# Adobe Glyph List doesn't contain uniXXXX names,
				# so we add that particular uniXXXX to our list for easier handling later on
				$AGL_names[w] = $1 if w =~ /^uni(.*)$/

				unicode_point = $AGL_names[w]
				if unicode_point.nil?
					# the glyph is not in AGL and isn't uniXXXX: print a warning
					puts sprintf(">> error: %s unknown (index 0x%02X)", w, i)
				elsif unicode_point == "?"
					$file_map.printf("; %-20s: no Unicode mapping assigned\n", w)
					$file_fixed_enc.printf("/%-15s %% 0x%02X\n", w, i)
					$file_encoding2unicode.printf("0x%02X\tU+....\t\t%s\n", i, w)
				elsif unicode_point.size > 4
					# somewhat unreliable way to filter out uniXXXX.something
					$file_map.printf("; %-20s: no unique way to map to Unicode\n", w)
					$file_fixed_enc.printf("/%-15s %% 0x%02X U+%s\n", w, i, unicode_point)
					$file_encoding2unicode.printf("0x%02X\tU+....\t\t%s\n", i, w)
				else
					$file_map.printf("%d\t<>\tU+%s\t; %s\n", i, unicode_point, w)
					# the code point is only spelled out when it differs from the slot number
					if i != unicode_point.hex
						$file_fixed_enc.printf("/%-15s %% 0x%02X U+%s\n", w, i, unicode_point)
					else
						$file_fixed_enc.printf("/%-15s %% 0x%02X\n", w, i)
					end

					lowercase = ""
					if $lowercase_letter[unicode_point] && unicode_point.hex > 127
						# exception: in Thai, we don't want any character below 0xA0
						lowercase = "1" unless enc == "lth" && i < 0xA0
					end
					$file_encoding2unicode.printf("0x%02X\tU+%s\t%s\t%s\n", i, unicode_point, lowercase, w)
				end
				# unknown glyph names occupy a slot as well
				i += 1
			end
		end
	ensure
		# close every file that was opened, also when an error interrupts the run
		[$file_map, $file_fixed_enc, $file_encoding2unicode].each do |file|
			file.close if file && !file.closed?
		end
	end
end
160
161
162