1 /*
2 * Copyright (C) 2007 Apple Computer, Inc.
3 *
4 * Portions are Copyright (C) 1998 Netscape Communications Corporation.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Alternatively, the contents of this file may be used under the terms
21 * of either the Mozilla Public License Version 1.1, found at
22 * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
23 * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
24 * (the "GPL"), in which case the provisions of the MPL or the GPL are
25 * applicable instead of those above. If you wish to allow use of your
26 * version of this file only under the terms of one of those two
27 * licenses (the MPL or the GPL) and not to allow others to use your
28 * version of this file under the LGPL, indicate your decision by
 * deleting the provisions above and replace them with the notice and
30 * other provisions required by the MPL or the GPL, as the case may be.
31 * If you do not delete the provisions above, a recipient may use your
32 * version of this file under any of the LGPL, the MPL or the GPL.
33 */
34
35 #include "config.h"
36 #include "UnicodeRange.h"
37
38 namespace WebCore {
39
// Language-group name for each "specific" unicode range.
// This table depends on the unicode range definitions: each item's index
// must correspond to a unicode range value,
// e.g. "x-cyrillic" == gUnicodeRangeToLangGroupTable[cRangeCyrillic].
// Both the strings and the pointer slots are const; the table is read-only.
static const char* const gUnicodeRangeToLangGroupTable[] =
{
    "x-cyrillic",
    "el",
    "tr",
    "he",
    "ar",
    "x-baltic",
    "th",
    "ko",
    "ja",
    "zh-CN",
    "zh-TW",
    "x-devanagari",
    "x-tamil",
    "x-armn",
    "x-beng",
    "x-cans",
    "x-ethi",
    "x-geor",
    "x-gujr",
    "x-guru",
    "x-khmr",
    "x-mlym"
};
68
69 /**********************************************************************
70 * Unicode subranges as defined in unicode 3.0
71 * x-western, x-central-euro, tr, x-baltic -> latin
72 * 0000 - 036f
73 * 1e00 - 1eff
74 * 2000 - 206f (general punctuation)
75 * 20a0 - 20cf (currency symbols)
76 * 2100 - 214f (letterlike symbols)
77 * 2150 - 218f (Number Forms)
78 * el -> greek
79 * 0370 - 03ff
80 * 1f00 - 1fff
81 * x-cyrillic -> cyrillic
82 * 0400 - 04ff
83 * he -> hebrew
84 * 0590 - 05ff
85 * ar -> arabic
86 * 0600 - 06ff
87 * fb50 - fdff (arabic presentation forms)
88 * fe70 - feff (arabic presentation forms b)
89 * th - thai
90 * 0e00 - 0e7f
91 * ko -> korean
92 * ac00 - d7af (hangul Syllables)
93 * 1100 - 11ff (jamo)
94 * 3130 - 318f (hangul compatibility jamo)
95 * ja
96 * 3040 - 309f (hiragana)
97 * 30a0 - 30ff (katakana)
98 * zh-CN
99 * zh-TW
100 *
101 * CJK
102 * 3100 - 312f (bopomofo)
103 * 31a0 - 31bf (bopomofo extended)
104 * 3000 - 303f (CJK Symbols and Punctuation)
105 * 2e80 - 2eff (CJK radicals supplement)
106 * 2f00 - 2fdf (Kangxi Radicals)
107 * 2ff0 - 2fff (Ideographic Description Characters)
108 * 3190 - 319f (kanbun)
109 * 3200 - 32ff (Enclosed CJK letters and Months)
110 * 3300 - 33ff (CJK compatibility)
111 * 3400 - 4dbf (CJK Unified Ideographs Extension A)
112 * 4e00 - 9faf (CJK Unified Ideographs)
113 * f900 - fa5f (CJK Compatibility Ideographs)
114 * fe30 - fe4f (CJK compatibility Forms)
115 * ff00 - ffef (halfwidth and fullwidth forms)
116 *
117 * Armenian
118 * 0530 - 058f
 * Syriac
120 * 0700 - 074f
121 * Thaana
122 * 0780 - 07bf
123 * Devanagari
124 * 0900 - 097f
125 * Bengali
126 * 0980 - 09ff
127 * Gurmukhi
128 * 0a00 - 0a7f
129 * Gujarati
130 * 0a80 - 0aff
131 * Oriya
132 * 0b00 - 0b7f
133 * Tamil
134 * 0b80 - 0bff
135 * Telugu
136 * 0c00 - 0c7f
137 * Kannada
138 * 0c80 - 0cff
139 * Malayalam
140 * 0d00 - 0d7f
141 * Sinhala
142 * 0d80 - 0def
143 * Lao
144 * 0e80 - 0eff
145 * Tibetan
146 * 0f00 - 0fbf
147 * Myanmar
148 * 1000 - 109f
149 * Georgian
150 * 10a0 - 10ff
151 * Ethiopic
152 * 1200 - 137f
153 * Cherokee
154 * 13a0 - 13ff
155 * Canadian Aboriginal Syllabics
156 * 1400 - 167f
157 * Ogham
158 * 1680 - 169f
159 * Runic
160 * 16a0 - 16ff
161 * Khmer
162 * 1780 - 17ff
163 * Mongolian
164 * 1800 - 18af
165 * Misc - superscripts and subscripts
166 * 2070 - 209f
167 * Misc - Combining Diacritical Marks for Symbols
168 * 20d0 - 20ff
169 * Misc - Arrows
170 * 2190 - 21ff
171 * Misc - Mathematical Operators
172 * 2200 - 22ff
173 * Misc - Miscellaneous Technical
174 * 2300 - 23ff
175 * Misc - Control picture
176 * 2400 - 243f
 * Misc - Optical character recognition
 * 2440 - 245f
 * Misc - Enclosed Alphanumerics
 * 2460 - 24ff
181 * Misc - Box Drawing
182 * 2500 - 257f
183 * Misc - Block Elements
184 * 2580 - 259f
185 * Misc - Geometric Shapes
186 * 25a0 - 25ff
187 * Misc - Miscellaneous Symbols
188 * 2600 - 267f
189 * Misc - Dingbats
190 * 2700 - 27bf
191 * Misc - Braille Patterns
192 * 2800 - 28ff
193 * Yi Syllables
194 * a000 - a48f
195 * Yi radicals
196 * a490 - a4cf
197 * Alphabetic Presentation Forms
198 * fb00 - fb4f
199 * Misc - Combining half Marks
200 * fe20 - fe2f
201 * Misc - small form variants
202 * fe50 - fe6f
203 * Misc - Specials
204 * fff0 - ffff
205 *********************************************************************/
206
207 static const unsigned cNumSubTables = 9;
208 static const unsigned cSubTableSize = 16;
209
210 static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] =
211 {
212 { // table for X---
213 cRangeTableBase+1, //u0xxx
214 cRangeTableBase+2, //u1xxx
215 cRangeTableBase+3, //u2xxx
216 cRangeSetCJK, //u3xxx
217 cRangeSetCJK, //u4xxx
218 cRangeSetCJK, //u5xxx
219 cRangeSetCJK, //u6xxx
220 cRangeSetCJK, //u7xxx
221 cRangeSetCJK, //u8xxx
222 cRangeSetCJK, //u9xxx
223 cRangeTableBase+4, //uaxxx
224 cRangeKorean, //ubxxx
225 cRangeKorean, //ucxxx
226 cRangeTableBase+5, //udxxx
227 cRangePrivate, //uexxx
228 cRangeTableBase+6 //ufxxx
229 },
230 { //table for 0X--
231 cRangeSetLatin, //u00xx
232 cRangeSetLatin, //u01xx
233 cRangeSetLatin, //u02xx
234 cRangeGreek, //u03xx XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks
235 cRangeCyrillic, //u04xx
236 cRangeTableBase+7, //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
237 cRangeArabic, //u06xx
238 cRangeTertiaryTable, //u07xx
239 cRangeUnassigned, //u08xx
240 cRangeTertiaryTable, //u09xx
241 cRangeTertiaryTable, //u0axx
242 cRangeTertiaryTable, //u0bxx
243 cRangeTertiaryTable, //u0cxx
244 cRangeTertiaryTable, //u0dxx
245 cRangeTertiaryTable, //u0exx
246 cRangeTibetan, //u0fxx
247 },
248 { //table for 1x--
249 cRangeTertiaryTable, //u10xx
250 cRangeKorean, //u11xx
251 cRangeEthiopic, //u12xx
252 cRangeTertiaryTable, //u13xx
253 cRangeCanadian, //u14xx
254 cRangeCanadian, //u15xx
255 cRangeTertiaryTable, //u16xx
256 cRangeKhmer, //u17xx
257 cRangeMongolian, //u18xx
258 cRangeUnassigned, //u19xx
259 cRangeUnassigned, //u1axx
260 cRangeUnassigned, //u1bxx
261 cRangeUnassigned, //u1cxx
262 cRangeUnassigned, //u1dxx
263 cRangeSetLatin, //u1exx
264 cRangeGreek, //u1fxx
265 },
266 { //table for 2x--
267 cRangeSetLatin, //u20xx
268 cRangeSetLatin, //u21xx
269 cRangeMathOperators, //u22xx
270 cRangeMiscTechnical, //u23xx
271 cRangeControlOpticalEnclose, //u24xx
272 cRangeBoxBlockGeometrics, //u25xx
273 cRangeMiscSymbols, //u26xx
274 cRangeDingbats, //u27xx
275 cRangeBraillePattern, //u28xx
276 cRangeUnassigned, //u29xx
277 cRangeUnassigned, //u2axx
278 cRangeUnassigned, //u2bxx
279 cRangeUnassigned, //u2cxx
280 cRangeUnassigned, //u2dxx
281 cRangeSetCJK, //u2exx
282 cRangeSetCJK, //u2fxx
283 },
284 { //table for ax--
285 cRangeYi, //ua0xx
286 cRangeYi, //ua1xx
287 cRangeYi, //ua2xx
288 cRangeYi, //ua3xx
289 cRangeYi, //ua4xx
290 cRangeUnassigned, //ua5xx
291 cRangeUnassigned, //ua6xx
292 cRangeUnassigned, //ua7xx
293 cRangeUnassigned, //ua8xx
294 cRangeUnassigned, //ua9xx
295 cRangeUnassigned, //uaaxx
296 cRangeUnassigned, //uabxx
297 cRangeKorean, //uacxx
298 cRangeKorean, //uadxx
299 cRangeKorean, //uaexx
300 cRangeKorean, //uafxx
301 },
302 { //table for dx--
303 cRangeKorean, //ud0xx
304 cRangeKorean, //ud1xx
305 cRangeKorean, //ud2xx
306 cRangeKorean, //ud3xx
307 cRangeKorean, //ud4xx
308 cRangeKorean, //ud5xx
309 cRangeKorean, //ud6xx
310 cRangeKorean, //ud7xx
311 cRangeSurrogate, //ud8xx
312 cRangeSurrogate, //ud9xx
313 cRangeSurrogate, //udaxx
314 cRangeSurrogate, //udbxx
315 cRangeSurrogate, //udcxx
316 cRangeSurrogate, //uddxx
317 cRangeSurrogate, //udexx
318 cRangeSurrogate, //udfxx
319 },
320 { // table for fx--
321 cRangePrivate, //uf0xx
322 cRangePrivate, //uf1xx
323 cRangePrivate, //uf2xx
324 cRangePrivate, //uf3xx
325 cRangePrivate, //uf4xx
326 cRangePrivate, //uf5xx
327 cRangePrivate, //uf6xx
328 cRangePrivate, //uf7xx
329 cRangePrivate, //uf8xx
330 cRangeSetCJK, //uf9xx
331 cRangeSetCJK, //ufaxx
332 cRangeArabic, //ufbxx, includes alphabic presentation form
333 cRangeArabic, //ufcxx
334 cRangeArabic, //ufdxx
335 cRangeArabic, //ufexx, includes Combining half marks,
336 // CJK compatibility forms,
337 // CJK compatibility forms,
338 // small form variants
339 cRangeTableBase+8, //uffxx, halfwidth and fullwidth forms, includes Specials
340 },
341 { //table for 0x0500 - 0x05ff
342 cRangeCyrillic, //u050x
343 cRangeCyrillic, //u051x
344 cRangeCyrillic, //u052x
345 cRangeArmenian, //u053x
346 cRangeArmenian, //u054x
347 cRangeArmenian, //u055x
348 cRangeArmenian, //u056x
349 cRangeArmenian, //u057x
350 cRangeArmenian, //u058x
351 cRangeHebrew, //u059x
352 cRangeHebrew, //u05ax
353 cRangeHebrew, //u05bx
354 cRangeHebrew, //u05cx
355 cRangeHebrew, //u05dx
356 cRangeHebrew, //u05ex
357 cRangeHebrew, //u05fx
358 },
359 { //table for 0xff00 - 0xffff
360 cRangeSetCJK, //uff0x, fullwidth latin
361 cRangeSetCJK, //uff1x, fullwidth latin
362 cRangeSetCJK, //uff2x, fullwidth latin
363 cRangeSetCJK, //uff3x, fullwidth latin
364 cRangeSetCJK, //uff4x, fullwidth latin
365 cRangeSetCJK, //uff5x, fullwidth latin
366 cRangeSetCJK, //uff6x, halfwidth katakana
367 cRangeSetCJK, //uff7x, halfwidth katakana
368 cRangeSetCJK, //uff8x, halfwidth katakana
369 cRangeSetCJK, //uff9x, halfwidth katakana
370 cRangeSetCJK, //uffax, halfwidth hangul jamo
371 cRangeSetCJK, //uffbx, halfwidth hangul jamo
372 cRangeSetCJK, //uffcx, halfwidth hangul jamo
373 cRangeSetCJK, //uffdx, halfwidth hangul jamo
374 cRangeSetCJK, //uffex, fullwidth symbols
375 cRangeSpecials, //ufffx, Specials
376 },
377 };
378
379 // Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
380 // code points so that the number of entries in the tertiary range
381 // table for that range is obtained by dividing (0x1700 - 0x0700) by 128.
382 // Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
383 // syllabaries take multiple chunks and Ogham and Runic share a single chunk.
384 static const unsigned cTertiaryTableSize = ((0x1700 - 0x0700) / 0x80);
385
386 static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] =
387 { //table for 0x0700 - 0x1600
388 cRangeSyriac, //u070x
389 cRangeThaana, //u078x
390 cRangeUnassigned, //u080x place holder(resolved in the 2ndary tab.)
391 cRangeUnassigned, //u088x place holder(resolved in the 2ndary tab.)
392 cRangeDevanagari, //u090x
393 cRangeBengali, //u098x
394 cRangeGurmukhi, //u0a0x
395 cRangeGujarati, //u0a8x
396 cRangeOriya, //u0b0x
397 cRangeTamil, //u0b8x
398 cRangeTelugu, //u0c0x
399 cRangeKannada, //u0c8x
400 cRangeMalayalam, //u0d0x
401 cRangeSinhala, //u0d8x
402 cRangeThai, //u0e0x
403 cRangeLao, //u0e8x
404 cRangeTibetan, //u0f0x place holder(resolved in the 2ndary tab.)
405 cRangeTibetan, //u0f8x place holder(resolved in the 2ndary tab.)
406 cRangeMyanmar, //u100x
407 cRangeGeorgian, //u108x
408 cRangeKorean, //u110x place holder(resolved in the 2ndary tab.)
409 cRangeKorean, //u118x place holder(resolved in the 2ndary tab.)
410 cRangeEthiopic, //u120x place holder(resolved in the 2ndary tab.)
411 cRangeEthiopic, //u128x place holder(resolved in the 2ndary tab.)
412 cRangeEthiopic, //u130x
413 cRangeCherokee, //u138x
414 cRangeCanadian, //u140x place holder(resolved in the 2ndary tab.)
415 cRangeCanadian, //u148x place holder(resolved in the 2ndary tab.)
416 cRangeCanadian, //u150x place holder(resolved in the 2ndary tab.)
417 cRangeCanadian, //u158x place holder(resolved in the 2ndary tab.)
418 cRangeCanadian, //u160x
419 cRangeOghamRunic, //u168x this contains two scripts, Ogham & Runic
420 };
421
422 // A two level index is almost enough for locating a range, with the
423 // exception of u03xx and u05xx. Since we don't really care about range for
424 // combining diacritical marks in our font application, they are
425 // not discriminated further. Future adoption of this method for other use
426 // should be aware of this limitation. The implementation can be extended if
427 // there is such a need.
428 // For Indic, Southeast Asian scripts and some other scripts between
429 // U+0700 and U+16FF, it's extended to the third level.
findCharUnicodeRange(UChar32 ch)430 unsigned int findCharUnicodeRange(UChar32 ch)
431 {
432 if (ch >= 0xFFFF)
433 return 0;
434
435 unsigned int range;
436
437 //search the first table
438 range = gUnicodeSubrangeTable[0][ch >> 12];
439
440 if (range < cRangeTableBase)
441 // we try to get a specific range
442 return range;
443
444 // otherwise, we have one more table to look at
445 range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8];
446 if (range < cRangeTableBase)
447 return range;
448 if (range < cRangeTertiaryTable)
449 return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4];
450
451 // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
452 return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
453 }
454
langGroupFromUnicodeRange(unsigned char unicodeRange)455 const char* langGroupFromUnicodeRange(unsigned char unicodeRange)
456 {
457 if (cRangeSpecificItemNum > unicodeRange)
458 return gUnicodeRangeToLangGroupTable[unicodeRange];
459 return 0;
460 }
461
462 }
463