• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
#!/usr/bin/env ruby

# This file generates patterns for Hyphenator.js
#     http://code.google.com/p/hyphenator
#
# Collaboration with:
#     Mathias Nater, <mathias at mnn.ch>
8
# Directory layout; every path is resolved relative to the directory the
# script is started from.
$path_root = File.expand_path("../../../hyph-utf8", Dir.getwd)
$path_sources = $path_root + "/source/generic/hyph-utf8"
$path_plain = $path_root + "/tex/generic/hyph-utf8/patterns/txt"
$path_repository = File.expand_path("../../collaboration/hyphenator", Dir.getwd)
$path_js = File.expand_path("../../repository/hyphenator", Dir.getwd)

# languages.rb provides the Languages class used below
load($path_sources + "/languages.rb")
# TODO: should be singleton
languages = Languages.new

# the Serbian codes reuse the entries registered under the "sh-*" codes
languages["sr-latn"] = languages["sh-latn"]
languages["sr-cyrl"] = languages["sh-cyrl"]

# switch to the folder with plain-text patterns and collect every pattern file in it
Dir.chdir($path_plain)
files = Dir["*.pat.txt"]
# files = Dir["*sl.pat.txt"]
26
# We need to escape some characters; for a complete list see
#     http://www.jslint.com/lint.html
# but at the moment there are only two such characters present anyway
# (U+200C and U+200D).
#
# This function wraps the string in single quotes so it can be written out
# as a JavaScript string literal. Every unsafe character is replaced with
# '%uXXXX' (XXXX = zero-padded uppercase hex code of the character), and if
# at least one replacement was made the literal is additionally wrapped in
# unescape(...) so that JavaScript decodes the escapes again.
#
# str               - UTF-8 string to emit
# unsafe_characters - code points that must be escaped
#                     (defaults to [0x200c, 0x200d])
#
# Returns "'...'" when nothing was escaped, "unescape('...')" otherwise.
#
# NOTE(review): single quotes and backslashes inside str are not escaped
# unless they are listed in unsafe_characters - confirm no pattern file
# relies on them.
def unescape_string_if_needed(str, unsafe_characters = [0x200c, 0x200d])
	# set to true as soon as the first replacement takes place
	replacement_done = false
	escaped = []

	# work on code points (to preserve proper unicode numbers) and build the
	# result in a single pass, so replacement text is never rescanned
	str.unpack("U*").each do |codepoint|
		if unsafe_characters.include?(codepoint)
			# %04X pads with zeros; code points above 0xFFFF would produce more
			# than four digits, but no such character is expected here
			escaped.concat(format("%%u%04X", codepoint).unpack("U*"))
			replacement_done = true
		else
			escaped << codepoint
		end
	end

	# convert the array back to string
	result = escaped.pack("U*")

	replacement_done ? "unescape('#{result}')" : "'#{result}'"
end
62
# A single hyphenation pattern (one line of a pattern file), e.g. ".ab1c".
class Pattern
	# pattern       - the pattern text with surrounding whitespace stripped
	# length        - number of Unicode code points in the pattern
	#                 (digits and dots included)
	# pattern_array - the pattern as an array of Unicode code points
	attr_reader :pattern, :length, :pattern_array

	def initialize(pattern)
		@pattern = pattern.strip
		@pattern_array = @pattern.unpack("U*")
		@length = @pattern_array.size
	end

	# Orders patterns by their length only; two patterns of the same length
	# compare as equal (0) regardless of their content.
	def <=>(anOther)
		length <=> anOther.length
	end

	# The pattern with every dot replaced by an underscore.
	def js_pattern
		@pattern.tr(".", "_")
	end

	def to_s
		pattern
	end

	# Number of code points in the pattern once the ASCII digits 0-9 are
	# left out (dots are still counted).
	def length_of_letters_only
		@pattern_array.reject { |codepoint| (0x30..0x39).include?(codepoint) }.size
	end
end
101
# A list of patterns. Elements are expected to respond to `length` and
# `length_of_letters_only` (as Pattern does).
# TODO: this should be an explicit array of patterns only
class Patterns < Array
	# Returns [shortest, longest]: the minimum and maximum value of
	# `length_of_letters_only` over all patterns.
	# An empty collection yields [0, 0] instead of raising.
	def length_of_shortest_and_longest_pattern
		return [0, 0] if empty?

		map { |pattern| pattern.length_of_letters_only }.minmax
	end

	# Yields one Array per distinct pattern `length`, in ascending order of
	# length; within a group the patterns keep their order in the collection.
	# Nothing is yielded for an empty collection.
	#
	# The collection does not need to be sorted by length beforehand: patterns
	# are bucketed by their length rather than split at the positions where
	# the length grows.
	#
	# Returns an enumerator when called without a block.
	def each_length
		return enum_for(:each_length) unless block_given?

		groups = Hash.new { |hash, len| hash[len] = [] }
		each { |pattern| groups[pattern.length] << pattern }

		groups.keys.sort.each { |len| yield groups[len] }
	end
end
144
# Generate one Hyphenator.js language file per plain-text pattern file.
files.each do |filename|
	# "hyph-<code>.pat.txt" -> "<code>"; the JavaScript identifier uses
	# underscores instead of dashes
	code_in  = filename.sub(/\Ahyph-(.*)\.pat\.txt\z/, '\1')
	code_out = code_in.gsub(/-/, "_")
	language = languages[code_in]
	# NOTE(review): assumes a missing language is reported as nil - confirm
	# against Languages#[]. Skipping here avoids leaving a truncated .js file.
	if language.nil?
		warn "Skipping #{filename}: no language entry for '#{code_in}'"
		next
	end

	puts
	puts "Generating Hyphenator.js support for " + code_in
	puts "    writing to '#{$path_js}/#{code_out}.js'"
	puts

	# read every non-blank line as one pattern
	patterns = Patterns.new
	File.open(filename, 'r') do |f_in|
		f_in.each_line do |line|
			patterns.push(Pattern.new(line)) unless line.strip.empty?
		end
	end
	# each_length below expects the patterns ordered by their length
	patterns.sort!

	# every character used in the patterns apart from '.', digits and a-z
	specialChars = patterns.join('').gsub(/[.0-9a-z]/, '').unpack('U*').sort.uniq.pack('U*')
	lengths = patterns.length_of_shortest_and_longest_pattern

	# One "<length> : '<all patterns of that length concatenated>'" entry per
	# pattern length. Everything is computed before the output file is
	# opened, so a failure here does not leave a half-written file behind.
	pattern_entries = []
	patterns.each_length do |pats|
		pattern_entries << "\t\t#{pats.first.length} : #{unescape_string_if_needed(pats.join(""))}"
	end

	File.open("#{$path_js}/#{code_out}.js", "w") do |f_out|
		# UTF-8 BOM mark (EF BB BF)
		f_out.putc(239)
		f_out.putc(187)
		f_out.putc(191)
		f_out.puts "Hyphenator.languages.#{code_out} = {"
		f_out.puts "\tleftmin : #{language.hyphenmin[0]},"
		f_out.puts "\trightmin : #{language.hyphenmin[1]},"
		f_out.puts "\tshortestPattern : #{lengths.first},"
		f_out.puts "\tlongestPattern : #{lengths.last},"
		# TODO: handle Ux201C, Ux201D
		f_out.puts "\tspecialChars : #{unescape_string_if_needed(specialChars)},"
		f_out.puts "\tpatterns : {"
		# properties of a JavaScript object literal must be separated by
		# commas; no comma after the last entry
		f_out.puts pattern_entries.join(",\n") unless pattern_entries.empty?
		f_out.puts "\t}"
		f_out.puts "};"
	end
end
200