/*
**********************************************************************
*   Copyright (C) 1999-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File USC_IMPL.C
*
* Modification History:
*
*   Date        Name        Description
*   07/08/2002  Eric Mader  Creation.
******************************************************************************
*/
15
16 #include "unicode/uscript.h"
17 #include "usc_impl.h"
18 #include "cmemory.h"
19
/*
 * Number of elements in a true array object.
 * The argument is parenthesized so that any array-valued expression can be
 * passed safely. It must not be applied to a pointer (the result would be
 * sizeof(pointer) / sizeof(element), not an element count).
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
21
/* Capacity of the circular stack of open paired characters. */
#define PAREN_STACK_DEPTH 32

/*
 * Stack-index arithmetic. The stack is a ring buffer, so every computed
 * index is reduced modulo PAREN_STACK_DEPTH. DEC adds the depth before
 * subtracting so that an index of -1 (empty stack) still yields a
 * non-negative operand for the remainder.
 */
#define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
#define INC(sp,count) (MOD((sp) + (count)))
#define DEC(sp,count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
#define INC1(sp) (INC(sp, 1))
#define DEC1(sp) (DEC(sp, 1))

/* Saturating increment for the counters: never exceeds PAREN_STACK_DEPTH. */
#define LIMIT_INC(sp) (((sp) >= PAREN_STACK_DEPTH)? PAREN_STACK_DEPTH : (sp) + 1)

/* Stack state queries; the stack is empty when no pushes are outstanding. */
#define STACK_IS_EMPTY(scriptRun) ((scriptRun)->pushCount <= 0)
#define STACK_IS_NOT_EMPTY(scriptRun) ((scriptRun)->pushCount > 0)

/* The entry most recently pushed; only meaningful on a non-empty stack. */
#define TOP(scriptRun) ((scriptRun)->parenStack[(scriptRun)->parenSP])

/* Forget any pending fixups (used when a new run begins). */
#define SYNC_FIXUP(scriptRun) ((scriptRun)->fixupCount = 0)
34
35 struct ParenStackEntry
36 {
37 int32_t pairIndex;
38 UScriptCode scriptCode;
39 };
40
41 struct UScriptRun
42 {
43 int32_t textLength;
44 const UChar *textArray;
45
46 int32_t scriptStart;
47 int32_t scriptLimit;
48 UScriptCode scriptCode;
49
50 struct ParenStackEntry parenStack[PAREN_STACK_DEPTH];
51 int32_t parenSP;
52 int32_t pushCount;
53 int32_t fixupCount;
54 };
55
56 static int8_t highBit(int32_t value);
57
58 static const UChar32 pairedChars[] = {
59 0x0028, 0x0029, /* ascii paired punctuation */
60 0x003c, 0x003e,
61 0x005b, 0x005d,
62 0x007b, 0x007d,
63 0x00ab, 0x00bb, /* guillemets */
64 0x2018, 0x2019, /* general punctuation */
65 0x201c, 0x201d,
66 0x2039, 0x203a,
67 0x3008, 0x3009, /* chinese paired punctuation */
68 0x300a, 0x300b,
69 0x300c, 0x300d,
70 0x300e, 0x300f,
71 0x3010, 0x3011,
72 0x3014, 0x3015,
73 0x3016, 0x3017,
74 0x3018, 0x3019,
75 0x301a, 0x301b
76 };
77
/*
 * Push an open paired character onto the paren stack.
 *
 * scriptRun  - the iterator whose stack is updated
 * pairIndex  - index of the open character in pairedChars
 * scriptCode - script to remember for the matching close character
 *
 * The stack is circular: once PAREN_STACK_DEPTH entries are live, the
 * counters stop growing and the new entry overwrites the oldest one.
 */
static void push(UScriptRun *scriptRun, int32_t pairIndex, UScriptCode scriptCode)
{
    struct ParenStackEntry *slot;
    int32_t newTop = INC1(scriptRun->parenSP);

    /* both counters saturate at the stack depth */
    scriptRun->fixupCount = LIMIT_INC(scriptRun->fixupCount);
    scriptRun->pushCount = LIMIT_INC(scriptRun->pushCount);

    scriptRun->parenSP = newTop;

    slot = &scriptRun->parenStack[newTop];
    slot->pairIndex = pairIndex;
    slot->scriptCode = scriptCode;
}
87
/*
 * Discard the top entry of the paren stack; does nothing when the stack
 * is already empty.
 *
 * A pending fixup, if any, is dropped along with the entry. When the last
 * entry is removed the stack pointer goes back to its initial value (-1).
 */
static void pop(UScriptRun *scriptRun)
{
    if (STACK_IS_NOT_EMPTY(scriptRun)) {
        if (scriptRun->fixupCount > 0) {
            --scriptRun->fixupCount;
        }

        --scriptRun->pushCount;

        /* step back around the ring, or reset if nothing is left */
        scriptRun->parenSP = STACK_IS_EMPTY(scriptRun) ? -1 : DEC1(scriptRun->parenSP);
    }
}
108
/*
 * Assign scriptCode to the topmost fixupCount stack entries, i.e. to the
 * open characters that were pushed before the script of the current run
 * was known.
 *
 * scriptRun  - the iterator whose stack is updated
 * scriptCode - the script now resolved for the current run
 *
 * On return fixupCount is exactly 0: all pending entries have been fixed.
 * (A post-decrement in the loop test would leave the counter at -1, which
 * makes the next push() under-count by one.)
 */
static void fixup(UScriptRun *scriptRun, UScriptCode scriptCode)
{
    int32_t remaining = scriptRun->fixupCount;

    if (remaining > 0) {
        /* start just below the oldest pending entry, then walk up to the top */
        int32_t fixupSP = DEC(scriptRun->parenSP, remaining);

        for (; remaining > 0; --remaining) {
            fixupSP = INC1(fixupSP);
            scriptRun->parenStack[fixupSP].scriptCode = scriptCode;
        }
    }

    scriptRun->fixupCount = 0;
}
118
/*
 * Return the zero-based position of the most significant set bit of value,
 * i.e. floor(log2(value)).
 *
 * Returns -32 when value is zero or negative.
 */
static int8_t
highBit(int32_t value)
{
    int8_t position = 0;
    int32_t shift;

    if (value <= 0) {
        return -32;
    }

    /* binary search for the top bit: try 16, 8, 4, 2, then 1 */
    for (shift = 16; shift > 0; shift >>= 1) {
        if ((value >> shift) != 0) {
            value >>= shift;
            position = (int8_t) (position + shift);
        }
    }

    return position;
}
155
156 static int32_t
getPairIndex(UChar32 ch)157 getPairIndex(UChar32 ch)
158 {
159 int32_t pairedCharCount = ARRAY_SIZE(pairedChars);
160 int32_t pairedCharPower = 1 << highBit(pairedCharCount);
161 int32_t pairedCharExtra = pairedCharCount - pairedCharPower;
162
163 int32_t probe = pairedCharPower;
164 int32_t index = 0;
165
166 if (ch >= pairedChars[pairedCharExtra]) {
167 index = pairedCharExtra;
168 }
169
170 while (probe > (1 << 0)) {
171 probe >>= 1;
172
173 if (ch >= pairedChars[index + probe]) {
174 index += probe;
175 }
176 }
177
178 if (pairedChars[index] != ch) {
179 index = -1;
180 }
181
182 return index;
183 }
184
185 static UBool
sameScript(UScriptCode scriptOne,UScriptCode scriptTwo)186 sameScript(UScriptCode scriptOne, UScriptCode scriptTwo)
187 {
188 return scriptOne <= USCRIPT_INHERITED || scriptTwo <= USCRIPT_INHERITED || scriptOne == scriptTwo;
189 }
190
191 U_CAPI UScriptRun * U_EXPORT2
uscript_openRun(const UChar * src,int32_t length,UErrorCode * pErrorCode)192 uscript_openRun(const UChar *src, int32_t length, UErrorCode *pErrorCode)
193 {
194 UScriptRun *result = NULL;
195
196 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
197 return NULL;
198 }
199
200 result = uprv_malloc(sizeof (UScriptRun));
201
202 if (result == NULL) {
203 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
204 return NULL;
205 }
206
207 uscript_setRunText(result, src, length, pErrorCode);
208
209 /* Release the UScriptRun if uscript_setRunText() returns an error */
210 if (U_FAILURE(*pErrorCode)) {
211 uprv_free(result);
212 result = NULL;
213 }
214
215 return result;
216 }
217
218 U_CAPI void U_EXPORT2
uscript_closeRun(UScriptRun * scriptRun)219 uscript_closeRun(UScriptRun *scriptRun)
220 {
221 if (scriptRun != NULL) {
222 uprv_free(scriptRun);
223 }
224 }
225
226 U_CAPI void U_EXPORT2
uscript_resetRun(UScriptRun * scriptRun)227 uscript_resetRun(UScriptRun *scriptRun)
228 {
229 if (scriptRun != NULL) {
230 scriptRun->scriptStart = 0;
231 scriptRun->scriptLimit = 0;
232 scriptRun->scriptCode = USCRIPT_INVALID_CODE;
233 scriptRun->parenSP = -1;
234 scriptRun->pushCount = 0;
235 scriptRun->fixupCount = 0;
236 }
237 }
238
239 U_CAPI void U_EXPORT2
uscript_setRunText(UScriptRun * scriptRun,const UChar * src,int32_t length,UErrorCode * pErrorCode)240 uscript_setRunText(UScriptRun *scriptRun, const UChar *src, int32_t length, UErrorCode *pErrorCode)
241 {
242 if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
243 return;
244 }
245
246 if (scriptRun == NULL || length < 0 || ((src == NULL) != (length == 0))) {
247 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
248 return;
249 }
250
251 scriptRun->textArray = src;
252 scriptRun->textLength = length;
253
254 uscript_resetRun(scriptRun);
255 }
256
257 U_CAPI UBool U_EXPORT2
uscript_nextRun(UScriptRun * scriptRun,int32_t * pRunStart,int32_t * pRunLimit,UScriptCode * pRunScript)258 uscript_nextRun(UScriptRun *scriptRun, int32_t *pRunStart, int32_t *pRunLimit, UScriptCode *pRunScript)
259 {
260 UErrorCode error = U_ZERO_ERROR;
261
262 /* if we've fallen off the end of the text, we're done */
263 if (scriptRun == NULL || scriptRun->scriptLimit >= scriptRun->textLength) {
264 return FALSE;
265 }
266
267 SYNC_FIXUP(scriptRun);
268 scriptRun->scriptCode = USCRIPT_COMMON;
269
270 for (scriptRun->scriptStart = scriptRun->scriptLimit; scriptRun->scriptLimit < scriptRun->textLength; scriptRun->scriptLimit += 1) {
271 UChar high = scriptRun->textArray[scriptRun->scriptLimit];
272 UChar32 ch = high;
273 UScriptCode sc;
274 int32_t pairIndex;
275
276 /*
277 * if the character is a high surrogate and it's not the last one
278 * in the text, see if it's followed by a low surrogate
279 */
280 if (high >= 0xD800 && high <= 0xDBFF && scriptRun->scriptLimit < scriptRun->textLength - 1) {
281 UChar low = scriptRun->textArray[scriptRun->scriptLimit + 1];
282
283 /*
284 * if it is followed by a low surrogate,
285 * consume it and form the full character
286 */
287 if (low >= 0xDC00 && low <= 0xDFFF) {
288 ch = (high - 0xD800) * 0x0400 + low - 0xDC00 + 0x10000;
289 scriptRun->scriptLimit += 1;
290 }
291 }
292
293 sc = uscript_getScript(ch, &error);
294 pairIndex = getPairIndex(ch);
295
296 /*
297 * Paired character handling:
298 *
299 * if it's an open character, push it onto the stack.
300 * if it's a close character, find the matching open on the
301 * stack, and use that script code. Any non-matching open
302 * characters above it on the stack will be poped.
303 */
304 if (pairIndex >= 0) {
305 if ((pairIndex & 1) == 0) {
306 push(scriptRun, pairIndex, scriptRun->scriptCode);
307 } else {
308 int32_t pi = pairIndex & ~1;
309
310 while (STACK_IS_NOT_EMPTY(scriptRun) && TOP(scriptRun).pairIndex != pi) {
311 pop(scriptRun);
312 }
313
314 if (STACK_IS_NOT_EMPTY(scriptRun)) {
315 sc = TOP(scriptRun).scriptCode;
316 }
317 }
318 }
319
320 if (sameScript(scriptRun->scriptCode, sc)) {
321 if (scriptRun->scriptCode <= USCRIPT_INHERITED && sc > USCRIPT_INHERITED) {
322 scriptRun->scriptCode = sc;
323
324 fixup(scriptRun, scriptRun->scriptCode);
325 }
326
327 /*
328 * if this character is a close paired character,
329 * pop the matching open character from the stack
330 */
331 if (pairIndex >= 0 && (pairIndex & 1) != 0) {
332 pop(scriptRun);
333 }
334 } else {
335 /*
336 * if the run broke on a surrogate pair,
337 * end it before the high surrogate
338 */
339 if (ch >= 0x10000) {
340 scriptRun->scriptLimit -= 1;
341 }
342
343 break;
344 }
345 }
346
347
348 if (pRunStart != NULL) {
349 *pRunStart = scriptRun->scriptStart;
350 }
351
352 if (pRunLimit != NULL) {
353 *pRunLimit = scriptRun->scriptLimit;
354 }
355
356 if (pRunScript != NULL) {
357 *pRunScript = scriptRun->scriptCode;
358 }
359
360 return TRUE;
361 }
362