#!/usr/bin/env ruby

# This file generates patterns for Hyphenator.js
# http://code.google.com/p/hyphenator
#
# Collaboration with:
# Mathias Nater, <mathias at mnn.ch>

$path_root = File.expand_path("../../../hyph-utf8", Dir.getwd)
$path_sources = "#{$path_root}/source/generic/hyph-utf8"
$path_plain = "#{$path_root}/tex/generic/hyph-utf8/patterns/txt"
$path_repository = File.expand_path("../../collaboration/hyphenator", Dir.getwd)
$path_js = File.expand_path("../../repository/hyphenator", Dir.getwd)

load "#{$path_sources}/languages.rb"
# TODO: should be singleton
languages = Languages.new

# Serbian reuses the Serbo-Croatian entries
languages["sr-latn"] = languages["sh-latn"]
languages["sr-cyrl"] = languages["sh-cyrl"]

# change to the folder with plain-text patterns and read all the files in it
# (sorted, so that the files are always processed in the same order)
Dir.chdir($path_plain)
files = Dir.glob("*.pat.txt").sort
# files = Dir.glob("*sl.pat.txt")

# We need to escape some characters; for a complete list see
#     http://www.jslint.com/lint.html
# but at the moment there are only two such characters present anyway.
#
# This function encapsulates the string into single quotes and, when the
# string contains any of the unsafe characters, replaces each of them with
# '%uXXXX' and wraps the literal into a JavaScript unescape(...) call.
#
# Backslashes and single quotes are backslash-escaped, because otherwise they
# would terminate or corrupt the single-quoted JavaScript string literal.
#
# str     - UTF-8 encoded String
# returns - String holding a JavaScript expression: either '...' or
#           unescape('...')
def unescape_string_if_needed(str)
  # unsafe characters - see above for complete list
  unsafe_characters = [0x200c, 0x200d]
  # set this to false until the first replacement takes place
  replacement_done = false

  # work on an array of code points (to preserve proper unicode numbers)
  # and build the escaped string in a single pass
  escaped = []
  str.unpack("U*").each do |c|
    if unsafe_characters.include?(c)
      # replace a single character with '%uXXXX', where XXXX is the
      # zero-padded hex code of the character; this only works for
      # characters up to U+FFFF, but bigger numbers are not expected here
      escaped.concat(sprintf("%%u%04X", c).unpack("U*"))
      replacement_done = true
    elsif c == 0x5c || c == 0x27
      # backslash or single quote: prefix it with a backslash
      escaped.push(0x5c, c)
    else
      escaped.push(c)
    end
  end

  # convert the array back to string
  quoted = "'#{escaped.pack("U*")}'"

  replacement_done ? "unescape(#{quoted})" : quoted
end

# A single hyphenation pattern, as read from one line of a *.pat.txt file.
class Pattern
  attr_reader :pattern, :length, :pattern_array

  # pattern - one line of the patterns file; surrounding whitespace is stripped
  def initialize(pattern)
    @pattern = pattern.strip
    @pattern_array = @pattern.unpack("U*")
    # length in code points (digits and dots included)
    @length = @pattern_array.length
  end

  # Patterns are ordered by their length only; patterns of the same length
  # compare as equal, regardless of their content.
  def <=>(other)
    @length <=> other.length
  end

  # The pattern with dots replaced by underscores.
  def js_pattern
    @pattern.gsub(/[.]/, "_")
  end

  def to_s
    @pattern
  end

  # Number of code points in the pattern once all digits are removed.
  def length_of_letters_only
    @pattern.gsub(/[0-9]/, '').unpack("U*").length
  end
end

# TODO: this should be an explicit array of patterns only
class Patterns < Array
  # Returns [shortest, longest]: the minimum and maximum of
  # Pattern#length_of_letters_only over all the patterns.
  # Returns [0, 0] when there are no patterns at all.
  def length_of_shortest_and_longest_pattern
    return [0, 0] if empty?

    lengths = map { |pat| pat.length_of_letters_only }
    [lengths.min, lengths.max]
  end

  # Yields one Patterns instance per distinct Pattern#length, in order of
  # increasing length. Within each group the patterns keep the order they have
  # in the receiver. The receiver does not need to be sorted beforehand.
  def each_length
    by_length = group_by { |pattern| pattern.length }
    by_length.keys.sort.each do |len|
      yield self.class.new(by_length[len])
    end
  end
end

files.each do |filename|
  # 'hyph-sh-latn.pat.txt' -> 'sh-latn' -> 'sh_latn'
  code_in = filename.gsub(/hyph-(.*)\.pat\.txt/, '\1')
  code_out = code_in.gsub(/-/, "_")
  language = languages[code_in]
  if language.nil?
    # without a language entry there is no hyphenmin to write out
    $stderr.puts "No language definition found for '#{code_in}'; skipping #{filename}"
    next
  end

  puts
  puts "Generating Hyphenator.js support for " + code_in
  puts "\twriting to '#{$path_js}/#{code_out}.js'"
  puts

  patterns = Patterns.new
  File.open(filename, 'r') do |f_in|
    f_in.each_line do |line|
      patterns.push(Pattern.new(line)) if line.strip.length > 0
    end
  end
  patterns.sort!
  # puts patterns

  # every character used in the patterns apart from dots, digits and a-z
  special_chars = patterns.join('').gsub(/[.0-9a-z]/, '').unpack('U*').sort.uniq.pack('U*')

  File.open("#{$path_js}/#{code_out}.js", "w") do |f_out|
    # BOM mark (EF BB BF)
    [239, 187, 191].each { |byte| f_out.putc(byte) }

    f_out.puts "Hyphenator.languages.#{code_out} = {"
    f_out.puts "\tleftmin : #{language.hyphenmin[0]},"
    f_out.puts "\trightmin : #{language.hyphenmin[1]},"
    lengths = patterns.length_of_shortest_and_longest_pattern
    f_out.puts "\tshortestPattern : #{lengths.first},"
    f_out.puts "\tlongestPattern : #{lengths.last},"
    # TODO: handle Ux201C, Ux201D
    f_out.puts "\tspecialChars : #{unescape_string_if_needed(special_chars)},"
    f_out.puts "\tpatterns : {"

    # one entry per pattern length; the entries of a JavaScript object literal
    # have to be separated with commas
    entries = []
    patterns.each_length do |pats|
      entries.push("\t\t#{pats.first.length} : #{unescape_string_if_needed(pats.join(""))}")
    end
    f_out.puts entries.join(",\n") unless entries.empty?

    f_out.puts "\t}"
    f_out.puts "};"
  end
end