• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
# A single Unicode character together with its slot in one particular
# (8-bit) encoding.
class UnicodeCharacter
	# code_uni - Unicode code point of the character
	# code_enc - code of the same character in the target encoding
	# bytes    - UTF-8 representation of code_uni as an array of byte values
	# name     - symbolic character name (like 'eacute')
	attr_reader :code_uni, :code_enc, :bytes, :name

	# Stores the three given values and derives the UTF-8 bytes from code_uni.
	def initialize(code_uni, code_enc, name)
		@code_uni, @code_enc, @name = code_uni, code_enc, name
		utf8 = [code_uni].pack('U')
		@bytes = utf8.each_byte.to_a
	end
end
15
# Hash of UnicodeCharacter objects. A given instance is filled through one of
# the two methods below, which decides how it is keyed.
class UnicodeCharacters < Hash
	# Files the character under the first byte of its UTF-8 form, written as a
	# two-digit lowercase hex string (e.g. 'c3'). Each key holds an array of all
	# characters sharing that first byte; the array is returned.
	def add_new_character_first_byte(code_uni, code_enc, name)
		key = [code_uni].pack('U').unpack('H2')[0]
		bucket = (self[key] ||= [])
		bucket << UnicodeCharacter.new(code_uni, code_enc, name)
	end

	# Files the character under its whole Unicode code point, replacing any
	# previous entry for that code point. Returns the new UnicodeCharacter.
	def add_new_character(code_uni, code_enc, name)
		store(code_uni, UnicodeCharacter.new(code_uni, code_enc, name))
	end
end
30
# Rewrites UTF-8 text so that every character known to one 8-bit encoding is
# replaced by a "^^xx" escape holding its code in that encoding.
#
# The character table is read from data/encodings/<encoding_name>.dat, looked
# up relative to this file. The 'ascii' encoding has no table and reads nothing.
class HyphEncoding
	attr_reader :encoding_name, :unicode_characters, :unicode_characters_first_byte, :lowercase_characters

	# encoding_name - base name of the data file (e.g. 'ec'), or 'ascii'.
	#
	# A missing data file is only reported on stdout; the tables stay empty.
	def initialize(encoding_name)
		@encoding_name = encoding_name
		@unicode_characters_first_byte = UnicodeCharacters.new
		@unicode_characters = UnicodeCharacters.new
		@lowercase_characters = []

		read_data unless encoding_name == 'ascii'
	end

	# Escapes a single String or every element of an Array of strings.
	#
	# An Array is modified in place and that same Array is returned; a String
	# yields a new String; any other object is returned untouched.
	def convert_to_escaped_characters(str)
		case str
		when Array
			str.map! { |item| convert_string_to_escaped_characters(item) }
		when String
			convert_string_to_escaped_characters(str)
		else
			str
		end
	end

	# Escapes one UTF-8 string, code point by code point:
	# * characters found in the encoding table become "^^<hex code_enc>"
	# * other ASCII characters (< 128) are copied as they are
	# * U+2019 (’) becomes a plain apostrophe
	# * U+01FD and U+0301 in the 'ec' encoding become "[U+XXXX]" and the whole
	#   result is prefixed with "% "
	# * anything else is reported on stdout and left out of the result
	def convert_string_to_escaped_characters(str)
		skip_this_string = false

		pieces = str.unpack('U*').map do |c|
			uc = @unicode_characters[c]
			if uc
				sprintf("^^%x", uc.code_enc)
			elsif c < 128
				[c].pack('U')
			elsif c == 8217 # ’
				"'"
			elsif (c == 0x01FD || c == 0x0301) && @encoding_name == 'ec'
				skip_this_string = true
				sprintf("[U+%04X]", c)
			else
				puts sprintf("There must be an error: character U+%04X in string '%s' is not ASCII or %s.", c, str, @encoding_name.upcase)
				nil # joins as an empty string, i.e. the character is dropped
			end
		end

		pieces.unshift("% ") if skip_this_string
		pieces.join('')
	end

private
	# Fills the three character tables from the encoding's data file.
	#
	# Each usable line looks like
	#   0x<code in encoding> TAB U+<unicode code> TAB <type> TAB <name>
	# and only lines whose type is 1 are kept. Lines of any other shape are
	# ignored. @lowercase_characters ends up ordered by code in the encoding.
	def read_data
		# fetch the characters
		encoding_data_dir = File.expand_path("../data/encodings", __FILE__)
		filename = "#{encoding_data_dir}/#{@encoding_name}.dat"

		unless File.exist?(filename)
			# TODO: throw an error
			puts "Invalid encoding name '#{@encoding_name}'."
			puts "File '#{filename}' doesn't exist."
			return
		end

		line_pattern = /^0x(\w+)\tU\+(\w+)\t(\d*)\t([_a-zA-Z0-9\.]*)$/

		# File.foreach closes the file when the block finishes, so no handle is
		# left behind for the garbage collector to clean up.
		File.foreach(filename) do |line|
			fields = line_pattern.match(line)
			next unless fields

			code_enc = fields[1].hex
			code_uni = fields[2].hex
			type     = fields[3].to_i # an empty type column counts as 0
			name     = fields[4]
			next unless type == 1

			@unicode_characters_first_byte.add_new_character_first_byte(code_uni, code_enc, name)
			@unicode_characters.add_new_character(code_uni, code_enc, name)
			@lowercase_characters.push(UnicodeCharacter.new(code_uni, code_enc, name))
		end

		# Sort once after everything is loaded instead of after every line.
		@lowercase_characters.sort! { |x, y| x.code_enc <=> y.code_enc }
	end
end
117