# this is a Unicode character represented in some particular encoding
class UnicodeCharacter
  # code_uni - Unicode code point (Integer)
  # code_enc - code of the character in that particular encoding (Integer)
  # name     - character name (like 'eacute')
  def initialize(code_uni, code_enc, name)
    @code_uni = code_uni
    @code_enc = code_enc
    # byte values of the code point when encoded as UTF-8
    @bytes = [code_uni].pack('U').bytes
    @name = name
  end

  attr_reader :code_uni, :code_enc, :bytes, :name
end

# A Hash of UnicodeCharacter objects. Depending on which of the two adder
# methods is used, it is keyed either by the first UTF-8 byte (each value is
# an Array of characters) or by the whole code point (each value is a single
# character).
class UnicodeCharacters < Hash
  # a hash based on the first character
  #
  # The key is the first byte of the UTF-8 form of code_uni, written as a
  # two-digit lowercase hex String (e.g. 'c3'). The new character is appended
  # to the Array stored under that key; the Array is created on first use.
  # Returns that Array.
  def add_new_character_first_byte(code_uni, code_enc, name)
    first_byte = [code_uni].pack('U').unpack('H2').first
    (self[first_byte] ||= []).push(UnicodeCharacter.new(code_uni, code_enc, name))
  end

  # a hash based on the whole unicode codepoint
  #
  # Stores the character under the Integer key code_uni, replacing any
  # previous entry. Returns the new UnicodeCharacter.
  def add_new_character(code_uni, code_enc, name)
    self[code_uni] = UnicodeCharacter.new(code_uni, code_enc, name)
  end
end

# Character tables of one font encoding, plus conversion of UTF-8 strings to
# the "^^xx" escaped notation for that encoding.
class HyphEncoding
  # One line of an encoding data file:
  #   0x<code_enc hex> TAB U+<code_uni hex> TAB <type digits, may be empty> TAB <name>
  DATA_LINE = /^0x(\w+)\tU\+(\w+)\t(\d*)\t([_a-zA-Z0-9\.]*)$/

  attr_reader :encoding_name, :unicode_characters, :unicode_characters_first_byte, :lowercase_characters

  # encoding_name - name of the encoding; for every name except 'ascii' the
  #                 file data/encodings/<encoding_name>.dat is loaded.
  def initialize(encoding_name)
    @encoding_name = encoding_name
    @unicode_characters_first_byte = UnicodeCharacters.new
    @unicode_characters = UnicodeCharacters.new
    @lowercase_characters = []

    read_data unless encoding_name == 'ascii'
  end

  # Converts a String, or every element of an Array of Strings, with
  # convert_string_to_escaped_characters.
  #
  # An Array is converted IN PLACE and the same Array object is returned;
  # a String yields a new String; any other object is returned untouched.
  def convert_to_escaped_characters(str)
    case str
    when Array
      str.map! { |item| convert_string_to_escaped_characters(item) }
    when String
      convert_string_to_escaped_characters(str)
    else
      str
    end
  end

  # Converts one UTF-8 String, code point by code point:
  #
  # * a code point known to this encoding becomes "^^" followed by the
  #   lowercase hex of its code in the encoding;
  # * other code points below 128 are copied unchanged;
  # * U+2019 (right single quotation mark) becomes "'";
  # * U+01FD and U+0301 in the 'ec' encoding become "[U+XXXX]" and cause the
  #   whole result to be prefixed with "% ";
  # * anything else is reported on standard output and contributes nothing
  #   to the result.
  #
  # Returns the converted String.
  def convert_string_to_escaped_characters(str)
    skip_this_string = false

    pieces = str.unpack('U*').map do |c|
      # check if unicode entry with that number exists
      uc = @unicode_characters[c]
      if uc
        sprintf("^^%x", uc.code_enc)
      elsif c < 128
        [c].pack('U')
      elsif c == 8217 # ’
        "'"
      elsif (c == 0x01FD || c == 0x0301) && @encoding_name == 'ec'
        skip_this_string = true
        sprintf("[U+%04X]", c)
      else
        puts sprintf("There must be an error: character U+%04X in string '%s' is not ASCII or %s.", c, str, @encoding_name.upcase)
        nil # joins as an empty string
      end
    end

    pieces.unshift("% ") if skip_this_string
    pieces.join
  end

  private

  # Loads data/encodings/<encoding_name>.dat (relative to the directory that
  # contains this file) and fills the three character tables.
  #
  # Only lines matching DATA_LINE whose type field equals 1 are used; an
  # empty type field counts as 0. Judging by the name of the list they are
  # stored in, type 1 presumably marks lowercase letters — confirm against
  # the data files. Afterwards @lowercase_characters is ordered by code_enc.
  #
  # A missing file is only reported on standard output.
  def read_data
    # fetch the characters
    encoding_data_dir = File.expand_path("../data/encodings", __FILE__)
    filename = "#{encoding_data_dir}/#{@encoding_name}.dat"

    unless File.exist?(filename)
      # TODO: throw an error
      puts "Invalid encoding name '#{@encoding_name}'."
      puts "File '#{filename}' doesn't exist."
      return
    end

    # File.foreach closes the file once the last line has been read
    File.foreach(filename) do |line|
      fields = DATA_LINE.match(line)
      next unless fields
      next unless fields[3].to_i == 1 # ''.to_i is 0

      code_enc = fields[1].hex
      code_uni = fields[2].hex
      name = fields[4]

      @unicode_characters_first_byte.add_new_character_first_byte(code_uni, code_enc, name)
      @unicode_characters.add_new_character(code_uni, code_enc, name)
      @lowercase_characters.push(UnicodeCharacter.new(code_uni, code_enc, name))
    end

    # sort once, after everything has been read
    @lowercase_characters.sort_by!(&:code_enc)
  end
end