• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env ruby
2
3require_relative 'hyph-utf8'
4
# Directory that contains this script; the paths below are resolved relative to it.
# Root of the tree: three directories above the script's own directory.
$path_root = File.expand_path("../../..", File.dirname(__FILE__))
# Presumably where the encoding definition files live (not read in this file itself).
$encoding_data_dir = File.expand_path("data/encodings", File.dirname(__FILE__))
# Directory into which the conv-utf8-*.tex files are written (see output_file_name).
$output_data_dir = $path_root + "/tex/generic/hyph-utf8/conversions"
8
# Build the path of the generated conversion file for +encoding+.
#
# @param encoding [String] encoding name, e.g. "ec"
# @return [String] "<$output_data_dir>/conv-utf8-<encoding>.tex"
def output_file_name(encoding)
	basename = "conv-utf8-#{encoding}.tex"
	File.join($output_data_dir, basename)
end
12
# printf-style template for the comment header of every generated file.
# Placeholders, in order (filled in by output_copyright_notice):
#   %s - encoding name (file name line)
#   %s - upcased encoding name (description line)
#   %d - current year (end of the copyright range)
# A literal TeX comment character is written as %% because the text goes through printf.
$header = <<'__END_OF_HEADER__'
%% conv-utf8-%s.tex
%%
%% Conversion from UTF-8 to %s,
%% used before loading hyphenation patterns for 8-bit TeX engines.
%%
%% This file is part of hyph-utf8 package and autogenerated.
%% See http://tug.org/tex-hyphen
%%
%% Copyright 2008-%d TeX Users Group.
%% You may freely use, modify and/or distribute this file.
%% (But consider adapting the scripts if you need modifications.)
__END_OF_HEADER__
26
# Write the file header ($header) to +outfile+, filling in the encoding name,
# its upcased form and the current year.
#
# @param outfile [#printf] destination stream
# @param encoding [String] encoding name, e.g. "ec"
# @return [nil]
def output_copyright_notice(outfile, encoding)
	current_year = Time.now.year
	outfile.printf($header, encoding, encoding.upcase, current_year)
end
30
# TeX source of \unicodechar: expands \csname unichar@<number>\endcsname when that
# control sequence is defined, otherwise issues an \errmessage.
# Raw heredoc: backslashes are literal, no escaping or interpolation happens.
$uniconvmacro1 = <<'__END_UNICODECHAR__'
% macros adapted from ConTeXt MKII; see unic-ini.mkii
\def\unicodechar#1{%
	\ifcsname unichar@\number#1\endcsname
		\csname unichar@\number#1\endcsname
	\else
		\errmessage{Unicode character [#1] not in encoding.}%
	\fi}
__END_UNICODECHAR__
40
# TeX macros that turn a UTF-8 byte sequence into a Unicode number and hand it to
# \unicodechar. The array is indexed by sequence length in bytes, so slots 0 and 1
# stay nil and slots 2..4 hold the two-, three- and four-byte variants.
# Raw heredocs: backslashes are literal, no escaping or interpolation happens.
$uniconvmacros = []

$uniconvmacros[2] = <<'__END_TWO_BYTES__'
\def\utftwouniglyph#1#2%
	{\expandafter\unicodechar\expandafter
		{\the\numexpr64*(#1-192)+`#2-128\relax}}
__END_TWO_BYTES__

$uniconvmacros[3] = <<'__END_THREE_BYTES__'
\def\utfthreeuniglyph#1#2#3%
	{\expandafter\unicodechar\expandafter
		{\the\numexpr4096*(#1-224)+64*(`#2-128)+`#3-128\relax}}
__END_THREE_BYTES__

$uniconvmacros[4] = <<'__END_FOUR_BYTES__'
\def\utffouruniglyph#1#2#3#4%
	{\expandafter\unicodechar\expandafter
		{\the\numexpr262144*(#1-240)+4096*(`#2-128)+64*(`#3-128)+`#4-128\relax}}
__END_FOUR_BYTES__
60
# TeX source of \addunichar, which defines \csname unichar@<number>\endcsname as the
# given replacement text. The string intentionally starts with an empty line.
# Raw heredoc: backslashes are literal, no escaping or interpolation happens.
$uniconvmacro2 = <<'__END_ADDUNICHAR__'

\def\addunichar #1 #2 {\expandafter\def\csname unichar@\number#1\endcsname{#2}}

% \addunichar "unicode_code - ^^font_encoding_code
__END_ADDUNICHAR__
67
# Generate conv-utf8-<encoding>.tex for the encodings that are converted through a
# lookup table: each possible first byte of a UTF-8 sequence is made active and
# defined to call \utf<two|three|four>uniglyph, which computes the Unicode number
# and passes it to \unicodechar; \addunichar registers the target slot per character.
["t8m", "lth"].each do |encoding|
	# load encoding
	e = HyphEncoding.new(encoding)

	# word used in the macro name \utf<word>uniglyph for each supported sequence length
	length_names = { 2 => "two", 3 => "three", 4 => "four" }

	# open file
	File.open(output_file_name(encoding), "w") do |file_out|

		# copyright notice
		output_copyright_notice(file_out, encoding)
		file_out.puts

		# macro to get mapping unicode -> font encoding & error message if screwed up
		file_out.puts $uniconvmacro1

		# [code, character] pairs, sorted once and reused for the table further down
		characters = e.unicode_characters.sort

		# minimal and maximal length of characters in the encoding (until now just 2 & 3),
		# read from the first and the last entry of the sorted list
		length_min = characters.first[1].bytes.size
		length_max = characters.last[1].bytes.size

		# only output the necessary macros for transforming UTF-8 -> Unicode number
		(length_min..length_max).each do |nbytes|
			file_out.puts $uniconvmacros[nbytes]
		end

		# macro to store mapping unicode -> font encoding
		file_out.puts $uniconvmacro2

		# all unicode characters in the encoding
		characters.each do |_code, c|
			file_out.printf("\\addunichar \"%04X ^^%02x \\lccode\"%02X=\"%02X %% %s - %s\n",
				c.code_uni, c.code_enc, c.code_enc, c.code_enc, [c.code_uni].pack('U'), c.name)
		end
		file_out.puts

		# make all the possible first characters active
		# output the definition into file
		e.unicode_characters_first_byte.sort.each do |first_byte_code, chars|
			byte = first_byte_code.hex
			size = chars[0].bytes.size
			# 2-byte: 0b11000000 <= byte < 0b11100000
			# 3-byte: 0b11100000 <= byte < 0b11110000
			# 4-byte: 0b11110000 <= byte < 0b11111000
			# Any other length has no matching \utf...uniglyph macro; fail loudly
			# instead of writing a reference to the undefined \utfuniglyph.
			str = length_names.fetch(size) do
				raise "The encoding #{encoding} uses #{size}-byte UTF-8 sequences, but only 2 to 4 bytes are supported"
			end
			file_out.printf("\\catcode\"%02X=\\active \\def^^%02x{\\utf%suniglyph{\"%02X}}\n", byte, byte, str, byte)
		end
	end
end
122
# Generate conv-utf8-<encoding>.tex for the encodings whose characters all take two
# UTF-8 bytes: each first byte becomes an active character taking the second byte as
# its argument and selecting the target slot through a chain of \ifx tests.
["ec", "qx", "t2a", "lmc", "il2", "il3", "l7x"].each do |encoding|
	# load encoding
	e = HyphEncoding.new(encoding)

	# open file
	File.open(output_file_name(encoding), "w") do |file_out|

		# copyright notice
		output_copyright_notice(file_out, encoding)
		file_out.puts '%'

		# fetched once so that the in-place sort below is what the second pass iterates
		first_bytes = e.unicode_characters_first_byte.sort

		first_bytes.each do |first_byte_code, chars|
			# sorting all the second characters by Unicode code point
			chars.sort! { |a, b| a.code_uni <=> b.code_uni }
			# make all the possible first characters active
			# output the definition into file
			file_out.printf("\\catcode\"%02X=\\active\n", first_byte_code.hex)
		end
		file_out.puts "%"

		first_bytes.each do |first_byte_code, chars|
			first_byte = first_byte_code.hex
			size = chars[0].bytes.size
			# raise (not throw): throw without a matching catch only produces an
			# UncaughtThrowError that buries the actual message
			if size != 2
				raise "The encoding #{encoding} uses more than two bytes to encode characters"
			end

			file_out.printf("\\def^^%02x#1{%%\n", first_byte)
			# one "\ifx <second byte> <encoded slot> \else" line per character
			chars.each do |uni_character|
				file_out.printf("\t\\ifx#1^^%02x^^%02x\\else %% %s - U+%04X - %s\n",
					uni_character.bytes[1], uni_character.code_enc,
					[uni_character.code_uni].pack('U'), uni_character.code_uni, uni_character.name)
			end
			# reached in TeX only when none of the \ifx tests matched
			file_out.puts "\t\\errmessage{Hyphenation pattern file corrupted or #{encoding} encoding not supported!}"
			# close every \ifx opened above, then the \def itself
			file_out.puts(("\\fi" * chars.size) + "}")
		end

		file_out.puts '%'
		file_out.puts '% ensure all the chars above have valid \lccode values'
		file_out.puts '%'
		e.lowercase_characters.each do |character|
			# two upper-case hex digits of the encoded byte
			code = [ character.code_enc ].pack("c").unpack("H2").first.upcase
			# \lccode"FF="FF
			file_out.printf "\\lccode\"%s=\"%s %% %s - U+%04X - %s\n", code, code, [character.code_uni].pack('U'), character.code_uni, character.name
		end

		file_out.puts
	end
end
176