1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 1999-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: ubidi.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999jul27
16 * created by: Markus W. Scherer, updated by Matitiahu Allouche
17 *
18 */
19
20 #include "cmemory.h"
21 #include "unicode/utypes.h"
22 #include "unicode/ustring.h"
23 #include "unicode/uchar.h"
24 #include "unicode/ubidi.h"
25 #include "unicode/utf16.h"
26 #include "ubidi_props.h"
27 #include "ubidiimp.h"
28 #include "uassert.h"
29
30 /*
31 * General implementation notes:
32 *
33 * Throughout the implementation, there are comments like (W2) that refer to
34 * rules of the BiDi algorithm, in this example to the second rule of the
35 * resolution of weak types.
36 *
37 * For handling surrogate pairs, where two UChar's form one "abstract" (or UTF-32)
38 * character according to UTF-16, the second UChar gets the directional property of
39 * the entire character assigned, while the first one gets a BN, a boundary
40 * neutral, type, which is ignored by most of the algorithm according to
41 * rule (X9) and the implementation suggestions of the BiDi algorithm.
42 *
43 * Later, adjustWSLevels() will set the level for each BN to that of the
44 * following character (UChar), which results in surrogate pairs getting the
45 * same level on each of their surrogates.
46 *
47 * In a UTF-8 implementation, the same thing could be done: the last byte of
48 * a multi-byte sequence would get the "real" property, while all previous
49 * bytes of that sequence would get BN.
50 *
51 * It is not possible to assign all those parts of a character the same real
52 * property because this would fail in the resolution of weak types with rules
53 * that look at immediately surrounding types.
54 *
55 * As a related topic, this implementation does not remove Boundary Neutral
56 * types from the input, but ignores them wherever this is relevant.
57 * For example, the loop for the resolution of the weak types reads
58 * types until it finds a non-BN.
59 * Also, explicit embedding codes are neither changed into BN nor removed.
60 * They are only treated the same way real BNs are.
61 * As stated before, adjustWSLevels() takes care of them at the end.
62 * For the purpose of conformance, the levels of all these codes
63 * do not matter.
64 *
65 * Note that this implementation modifies the dirProps
66 * after the initial setup, when applying X5c (replace FSI by LRI or RLI),
67 * X6, N0 (replace paired brackets by L or R).
68 *
69 * In this implementation, the resolution of weak types (W1 to W6),
70 * neutrals (N1 and N2), and the assignment of the resolved level (In)
71 * are all done in one single loop, in resolveImplicitLevels().
72 * Changes of dirProp values are done on the fly, without writing
73 * them back to the dirProps array.
74 *
75 *
76 * This implementation contains code that allows to bypass steps of the
77 * algorithm that are not needed on the specific paragraph
78 * in order to speed up the most common cases considerably,
79 * like text that is entirely LTR, or RTL text without numbers.
80 *
81 * Most of this is done by setting a bit for each directional property
82 * in a flags variable and later checking for whether there are
83 * any LTR characters or any RTL characters, or both, whether
84 * there are any explicit embedding codes, etc.
85 *
86 * If the (Xn) steps are performed, then the flags are re-evaluated,
87 * because they will then not contain the embedding codes any more
88 * and will be adjusted for override codes, so that subsequently
89 * more bypassing may be possible than what the initial flags suggested.
90 *
91 * If the text is not mixed-directional, then the
92 * algorithm steps for the weak type resolution are not performed,
93 * and all levels are set to the paragraph level.
94 *
95 * If there are no explicit embedding codes, then the (Xn) steps
96 * are not performed.
97 *
98 * If embedding levels are supplied as a parameter, then all
99 * explicit embedding codes are ignored, and the (Xn) steps
100 * are not performed.
101 *
102 * White Space types could get the level of the run they belong to,
103 * and are checked with a test of (flags&MASK_EMBEDDING) to
104 * consider if the paragraph direction should be considered in
105 * the flags variable.
106 *
107 * If there are no White Space types in the paragraph, then
108 * (L1) is not necessary in adjustWSLevels().
109 */
110
111 /* to avoid some conditional statements, use tiny constant arrays */
112 static const Flags flagLR[2]={ DIRPROP_FLAG(L), DIRPROP_FLAG(R) };
113 static const Flags flagE[2]={ DIRPROP_FLAG(LRE), DIRPROP_FLAG(RLE) };
114 static const Flags flagO[2]={ DIRPROP_FLAG(LRO), DIRPROP_FLAG(RLO) };
115
116 #define DIRPROP_FLAG_LR(level) flagLR[(level)&1]
117 #define DIRPROP_FLAG_E(level) flagE[(level)&1]
118 #define DIRPROP_FLAG_O(level) flagO[(level)&1]
119
120 #define DIR_FROM_STRONG(strong) ((strong)==L ? L : R)
121
122 #define NO_OVERRIDE(level) ((level)&~UBIDI_LEVEL_OVERRIDE)
123
124 /* UBiDi object management -------------------------------------------------- */
125
126 U_CAPI UBiDi * U_EXPORT2
ubidi_open(void)127 ubidi_open(void)
128 {
129 UErrorCode errorCode=U_ZERO_ERROR;
130 return ubidi_openSized(0, 0, &errorCode);
131 }
132
133 U_CAPI UBiDi * U_EXPORT2
ubidi_openSized(int32_t maxLength,int32_t maxRunCount,UErrorCode * pErrorCode)134 ubidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode *pErrorCode) {
135 UBiDi *pBiDi;
136
137 /* check the argument values */
138 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
139 return NULL;
140 } else if(maxLength<0 || maxRunCount<0) {
141 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
142 return NULL; /* invalid arguments */
143 }
144
145 /* allocate memory for the object */
146 pBiDi=(UBiDi *)uprv_malloc(sizeof(UBiDi));
147 if(pBiDi==NULL) {
148 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
149 return NULL;
150 }
151
152 /* reset the object, all pointers NULL, all flags FALSE, all sizes 0 */
153 uprv_memset(pBiDi, 0, sizeof(UBiDi));
154
155 /* allocate memory for arrays as requested */
156 if(maxLength>0) {
157 if( !getInitialDirPropsMemory(pBiDi, maxLength) ||
158 !getInitialLevelsMemory(pBiDi, maxLength)
159 ) {
160 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
161 }
162 } else {
163 pBiDi->mayAllocateText=TRUE;
164 }
165
166 if(maxRunCount>0) {
167 if(maxRunCount==1) {
168 /* use simpleRuns[] */
169 pBiDi->runsSize=sizeof(Run);
170 } else if(!getInitialRunsMemory(pBiDi, maxRunCount)) {
171 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
172 }
173 } else {
174 pBiDi->mayAllocateRuns=TRUE;
175 }
176
177 if(U_SUCCESS(*pErrorCode)) {
178 return pBiDi;
179 } else {
180 ubidi_close(pBiDi);
181 return NULL;
182 }
183 }
184
185 /*
186 * We are allowed to allocate memory if memory==NULL or
187 * mayAllocate==TRUE for each array that we need.
188 * We also try to grow memory as needed if we
189 * allocate it.
190 *
191 * Assume sizeNeeded>0.
192 * If *pMemory!=NULL, then assume *pSize>0.
193 *
194 * ### this realloc() may unnecessarily copy the old data,
195 * which we know we don't need any more;
196 * is this the best way to do this??
197 */
198 U_CFUNC UBool
ubidi_getMemory(BidiMemoryForAllocation * bidiMem,int32_t * pSize,UBool mayAllocate,int32_t sizeNeeded)199 ubidi_getMemory(BidiMemoryForAllocation *bidiMem, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded) {
200 void **pMemory = (void **)bidiMem;
201 /* check for existing memory */
202 if(*pMemory==NULL) {
203 /* we need to allocate memory */
204 if(mayAllocate && (*pMemory=uprv_malloc(sizeNeeded))!=NULL) {
205 *pSize=sizeNeeded;
206 return TRUE;
207 } else {
208 return FALSE;
209 }
210 } else {
211 if(sizeNeeded<=*pSize) {
212 /* there is already enough memory */
213 return TRUE;
214 }
215 else if(!mayAllocate) {
216 /* not enough memory, and we must not allocate */
217 return FALSE;
218 } else {
219 /* we try to grow */
220 void *memory;
221 /* in most cases, we do not need the copy-old-data part of
222 * realloc, but it is needed when adding runs using getRunsMemory()
223 * in setParaRunsOnly()
224 */
225 if((memory=uprv_realloc(*pMemory, sizeNeeded))!=NULL) {
226 *pMemory=memory;
227 *pSize=sizeNeeded;
228 return TRUE;
229 } else {
230 /* we failed to grow */
231 return FALSE;
232 }
233 }
234 }
235 }
236
237 U_CAPI void U_EXPORT2
ubidi_close(UBiDi * pBiDi)238 ubidi_close(UBiDi *pBiDi) {
239 if(pBiDi!=NULL) {
240 pBiDi->pParaBiDi=NULL; /* in case one tries to reuse this block */
241 if(pBiDi->dirPropsMemory!=NULL) {
242 uprv_free(pBiDi->dirPropsMemory);
243 }
244 if(pBiDi->levelsMemory!=NULL) {
245 uprv_free(pBiDi->levelsMemory);
246 }
247 if(pBiDi->openingsMemory!=NULL) {
248 uprv_free(pBiDi->openingsMemory);
249 }
250 if(pBiDi->parasMemory!=NULL) {
251 uprv_free(pBiDi->parasMemory);
252 }
253 if(pBiDi->runsMemory!=NULL) {
254 uprv_free(pBiDi->runsMemory);
255 }
256 if(pBiDi->isolatesMemory!=NULL) {
257 uprv_free(pBiDi->isolatesMemory);
258 }
259 if(pBiDi->insertPoints.points!=NULL) {
260 uprv_free(pBiDi->insertPoints.points);
261 }
262
263 uprv_free(pBiDi);
264 }
265 }
266
267 /* set to approximate "inverse BiDi" ---------------------------------------- */
268
269 U_CAPI void U_EXPORT2
ubidi_setInverse(UBiDi * pBiDi,UBool isInverse)270 ubidi_setInverse(UBiDi *pBiDi, UBool isInverse) {
271 if(pBiDi!=NULL) {
272 pBiDi->isInverse=isInverse;
273 pBiDi->reorderingMode = isInverse ? UBIDI_REORDER_INVERSE_NUMBERS_AS_L
274 : UBIDI_REORDER_DEFAULT;
275 }
276 }
277
278 U_CAPI UBool U_EXPORT2
ubidi_isInverse(UBiDi * pBiDi)279 ubidi_isInverse(UBiDi *pBiDi) {
280 if(pBiDi!=NULL) {
281 return pBiDi->isInverse;
282 } else {
283 return FALSE;
284 }
285 }
286
287 /* FOOD FOR THOUGHT: currently the reordering modes are a mixture of
288 * algorithm for direct BiDi, algorithm for inverse BiDi and the bizarre
289 * concept of RUNS_ONLY which is a double operation.
290 * It could be advantageous to divide this into 3 concepts:
291 * a) Operation: direct / inverse / RUNS_ONLY
292 * b) Direct algorithm: default / NUMBERS_SPECIAL / GROUP_NUMBERS_WITH_R
293 * c) Inverse algorithm: default / INVERSE_LIKE_DIRECT / NUMBERS_SPECIAL
294 * This would allow combinations not possible today like RUNS_ONLY with
295 * NUMBERS_SPECIAL.
296 * Also allow to set INSERT_MARKS for the direct step of RUNS_ONLY and
297 * REMOVE_CONTROLS for the inverse step.
298 * Not all combinations would be supported, and probably not all do make sense.
299 * This would need to document which ones are supported and what are the
300 * fallbacks for unsupported combinations.
301 */
302 U_CAPI void U_EXPORT2
ubidi_setReorderingMode(UBiDi * pBiDi,UBiDiReorderingMode reorderingMode)303 ubidi_setReorderingMode(UBiDi *pBiDi, UBiDiReorderingMode reorderingMode) {
304 if ((pBiDi!=NULL) && (reorderingMode >= UBIDI_REORDER_DEFAULT)
305 && (reorderingMode < UBIDI_REORDER_COUNT)) {
306 pBiDi->reorderingMode = reorderingMode;
307 pBiDi->isInverse = (UBool)(reorderingMode == UBIDI_REORDER_INVERSE_NUMBERS_AS_L);
308 }
309 }
310
311 U_CAPI UBiDiReorderingMode U_EXPORT2
ubidi_getReorderingMode(UBiDi * pBiDi)312 ubidi_getReorderingMode(UBiDi *pBiDi) {
313 if (pBiDi!=NULL) {
314 return pBiDi->reorderingMode;
315 } else {
316 return UBIDI_REORDER_DEFAULT;
317 }
318 }
319
320 U_CAPI void U_EXPORT2
ubidi_setReorderingOptions(UBiDi * pBiDi,uint32_t reorderingOptions)321 ubidi_setReorderingOptions(UBiDi *pBiDi, uint32_t reorderingOptions) {
322 if (reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
323 reorderingOptions&=~UBIDI_OPTION_INSERT_MARKS;
324 }
325 if (pBiDi!=NULL) {
326 pBiDi->reorderingOptions=reorderingOptions;
327 }
328 }
329
330 U_CAPI uint32_t U_EXPORT2
ubidi_getReorderingOptions(UBiDi * pBiDi)331 ubidi_getReorderingOptions(UBiDi *pBiDi) {
332 if (pBiDi!=NULL) {
333 return pBiDi->reorderingOptions;
334 } else {
335 return 0;
336 }
337 }
338
339 U_CAPI UBiDiDirection U_EXPORT2
ubidi_getBaseDirection(const UChar * text,int32_t length)340 ubidi_getBaseDirection(const UChar *text,
341 int32_t length){
342
343 int32_t i;
344 UChar32 uchar;
345 UCharDirection dir;
346
347 if( text==NULL || length<-1 ){
348 return UBIDI_NEUTRAL;
349 }
350
351 if(length==-1) {
352 length=u_strlen(text);
353 }
354
355 for( i = 0 ; i < length; ) {
356 /* i is incremented by U16_NEXT */
357 U16_NEXT(text, i, length, uchar);
358 dir = u_charDirection(uchar);
359 if( dir == U_LEFT_TO_RIGHT )
360 return UBIDI_LTR;
361 if( dir == U_RIGHT_TO_LEFT || dir ==U_RIGHT_TO_LEFT_ARABIC )
362 return UBIDI_RTL;
363 }
364 return UBIDI_NEUTRAL;
365 }
366
367 /* perform (P2)..(P3) ------------------------------------------------------- */
368
369 /**
370 * Returns the directionality of the first strong character
371 * after the last B in prologue, if any.
372 * Requires prologue!=null.
373 */
374 static DirProp
firstL_R_AL(UBiDi * pBiDi)375 firstL_R_AL(UBiDi *pBiDi) {
376 const UChar *text=pBiDi->prologue;
377 int32_t length=pBiDi->proLength;
378 int32_t i;
379 UChar32 uchar;
380 DirProp dirProp, result=ON;
381 for(i=0; i<length; ) {
382 /* i is incremented by U16_NEXT */
383 U16_NEXT(text, i, length, uchar);
384 dirProp=(DirProp)ubidi_getCustomizedClass(pBiDi, uchar);
385 if(result==ON) {
386 if(dirProp==L || dirProp==R || dirProp==AL) {
387 result=dirProp;
388 }
389 } else {
390 if(dirProp==B) {
391 result=ON;
392 }
393 }
394 }
395 return result;
396 }
397
398 /*
399 * Check that there are enough entries in the array pointed to by pBiDi->paras
400 */
401 static UBool
checkParaCount(UBiDi * pBiDi)402 checkParaCount(UBiDi *pBiDi) {
403 int32_t count=pBiDi->paraCount;
404 if(pBiDi->paras==pBiDi->simpleParas) {
405 if(count<=SIMPLE_PARAS_COUNT)
406 return TRUE;
407 if(!getInitialParasMemory(pBiDi, SIMPLE_PARAS_COUNT * 2))
408 return FALSE;
409 pBiDi->paras=pBiDi->parasMemory;
410 uprv_memcpy(pBiDi->parasMemory, pBiDi->simpleParas, SIMPLE_PARAS_COUNT * sizeof(Para));
411 return TRUE;
412 }
413 if(!getInitialParasMemory(pBiDi, count * 2))
414 return FALSE;
415 pBiDi->paras=pBiDi->parasMemory;
416 return TRUE;
417 }
418
419 /*
420 * Get the directional properties for the text, calculate the flags bit-set, and
421 * determine the paragraph level if necessary (in pBiDi->paras[i].level).
422 * FSI initiators are also resolved and their dirProp replaced with LRI or RLI.
423 * When encountering an FSI, it is initially replaced with an LRI, which is the
424 * default. Only if a strong R or AL is found within its scope will the LRI be
425 * replaced by an RLI.
426 */
427 static UBool
getDirProps(UBiDi * pBiDi)428 getDirProps(UBiDi *pBiDi) {
429 const UChar *text=pBiDi->text;
430 DirProp *dirProps=pBiDi->dirPropsMemory; /* pBiDi->dirProps is const */
431
432 int32_t i=0, originalLength=pBiDi->originalLength;
433 Flags flags=0; /* collect all directionalities in the text */
434 UChar32 uchar;
435 DirProp dirProp=0, defaultParaLevel=0; /* initialize to avoid compiler warnings */
436 UBool isDefaultLevel=IS_DEFAULT_LEVEL(pBiDi->paraLevel);
437 /* for inverse BiDi, the default para level is set to RTL if there is a
438 strong R or AL character at either end of the text */
439 UBool isDefaultLevelInverse=isDefaultLevel && (UBool)
440 (pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_LIKE_DIRECT ||
441 pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL);
442 int32_t lastArabicPos=-1;
443 int32_t controlCount=0;
444 UBool removeBiDiControls = (UBool)(pBiDi->reorderingOptions &
445 UBIDI_OPTION_REMOVE_CONTROLS);
446
447 enum State {
448 NOT_SEEKING_STRONG, /* 0: not contextual paraLevel, not after FSI */
449 SEEKING_STRONG_FOR_PARA, /* 1: looking for first strong char in para */
450 SEEKING_STRONG_FOR_FSI, /* 2: looking for first strong after FSI */
451 LOOKING_FOR_PDI /* 3: found strong after FSI, looking for PDI */
452 };
453 State state;
454 DirProp lastStrong=ON; /* for default level & inverse BiDi */
455 /* The following stacks are used to manage isolate sequences. Those
456 sequences may be nested, but obviously never more deeply than the
457 maximum explicit embedding level.
458 lastStack is the index of the last used entry in the stack. A value of -1
459 means that there is no open isolate sequence.
460 lastStack is reset to -1 on paragraph boundaries. */
461 /* The following stack contains the position of the initiator of
462 each open isolate sequence */
463 int32_t isolateStartStack[UBIDI_MAX_EXPLICIT_LEVEL+1];
464 /* The following stack contains the last known state before
465 encountering the initiator of an isolate sequence */
466 State previousStateStack[UBIDI_MAX_EXPLICIT_LEVEL+1];
467 int32_t stackLast=-1;
468
469 if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING)
470 pBiDi->length=0;
471 defaultParaLevel=pBiDi->paraLevel&1;
472 if(isDefaultLevel) {
473 pBiDi->paras[0].level=defaultParaLevel;
474 lastStrong=defaultParaLevel;
475 if(pBiDi->proLength>0 && /* there is a prologue */
476 (dirProp=firstL_R_AL(pBiDi))!=ON) { /* with a strong character */
477 if(dirProp==L)
478 pBiDi->paras[0].level=0; /* set the default para level */
479 else
480 pBiDi->paras[0].level=1; /* set the default para level */
481 state=NOT_SEEKING_STRONG;
482 } else {
483 state=SEEKING_STRONG_FOR_PARA;
484 }
485 } else {
486 pBiDi->paras[0].level=pBiDi->paraLevel;
487 state=NOT_SEEKING_STRONG;
488 }
489 /* count paragraphs and determine the paragraph level (P2..P3) */
490 /*
491 * see comment in ubidi.h:
492 * the UBIDI_DEFAULT_XXX values are designed so that
493 * their bit 0 alone yields the intended default
494 */
495 for( /* i=0 above */ ; i<originalLength; ) {
496 /* i is incremented by U16_NEXT */
497 U16_NEXT(text, i, originalLength, uchar);
498 flags|=DIRPROP_FLAG(dirProp=(DirProp)ubidi_getCustomizedClass(pBiDi, uchar));
499 dirProps[i-1]=dirProp;
500 if(uchar>0xffff) { /* set the lead surrogate's property to BN */
501 flags|=DIRPROP_FLAG(BN);
502 dirProps[i-2]=BN;
503 }
504 if(removeBiDiControls && IS_BIDI_CONTROL_CHAR(uchar))
505 controlCount++;
506 if(dirProp==L) {
507 if(state==SEEKING_STRONG_FOR_PARA) {
508 pBiDi->paras[pBiDi->paraCount-1].level=0;
509 state=NOT_SEEKING_STRONG;
510 }
511 else if(state==SEEKING_STRONG_FOR_FSI) {
512 if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) {
513 /* no need for next statement, already set by default */
514 /* dirProps[isolateStartStack[stackLast]]=LRI; */
515 flags|=DIRPROP_FLAG(LRI);
516 }
517 state=LOOKING_FOR_PDI;
518 }
519 lastStrong=L;
520 continue;
521 }
522 if(dirProp==R || dirProp==AL) {
523 if(state==SEEKING_STRONG_FOR_PARA) {
524 pBiDi->paras[pBiDi->paraCount-1].level=1;
525 state=NOT_SEEKING_STRONG;
526 }
527 else if(state==SEEKING_STRONG_FOR_FSI) {
528 if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) {
529 dirProps[isolateStartStack[stackLast]]=RLI;
530 flags|=DIRPROP_FLAG(RLI);
531 }
532 state=LOOKING_FOR_PDI;
533 }
534 lastStrong=R;
535 if(dirProp==AL)
536 lastArabicPos=i-1;
537 continue;
538 }
539 if(dirProp>=FSI && dirProp<=RLI) { /* FSI, LRI or RLI */
540 stackLast++;
541 if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) {
542 isolateStartStack[stackLast]=i-1;
543 previousStateStack[stackLast]=state;
544 }
545 if(dirProp==FSI) {
546 dirProps[i-1]=LRI; /* default if no strong char */
547 state=SEEKING_STRONG_FOR_FSI;
548 }
549 else
550 state=LOOKING_FOR_PDI;
551 continue;
552 }
553 if(dirProp==PDI) {
554 if(state==SEEKING_STRONG_FOR_FSI) {
555 if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL) {
556 /* no need for next statement, already set by default */
557 /* dirProps[isolateStartStack[stackLast]]=LRI; */
558 flags|=DIRPROP_FLAG(LRI);
559 }
560 }
561 if(stackLast>=0) {
562 if(stackLast<=UBIDI_MAX_EXPLICIT_LEVEL)
563 state=previousStateStack[stackLast];
564 stackLast--;
565 }
566 continue;
567 }
568 if(dirProp==B) {
569 if(i<originalLength && uchar==CR && text[i]==LF) /* do nothing on the CR */
570 continue;
571 pBiDi->paras[pBiDi->paraCount-1].limit=i;
572 if(isDefaultLevelInverse && lastStrong==R)
573 pBiDi->paras[pBiDi->paraCount-1].level=1;
574 if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) {
575 /* When streaming, we only process whole paragraphs
576 thus some updates are only done on paragraph boundaries */
577 pBiDi->length=i; /* i is index to next character */
578 pBiDi->controlCount=controlCount;
579 }
580 if(i<originalLength) { /* B not last char in text */
581 pBiDi->paraCount++;
582 if(checkParaCount(pBiDi)==FALSE) /* not enough memory for a new para entry */
583 return FALSE;
584 if(isDefaultLevel) {
585 pBiDi->paras[pBiDi->paraCount-1].level=defaultParaLevel;
586 state=SEEKING_STRONG_FOR_PARA;
587 lastStrong=defaultParaLevel;
588 } else {
589 pBiDi->paras[pBiDi->paraCount-1].level=pBiDi->paraLevel;
590 state=NOT_SEEKING_STRONG;
591 }
592 stackLast=-1;
593 }
594 continue;
595 }
596 }
597 /* Ignore still open isolate sequences with overflow */
598 if(stackLast>UBIDI_MAX_EXPLICIT_LEVEL) {
599 stackLast=UBIDI_MAX_EXPLICIT_LEVEL;
600 state=SEEKING_STRONG_FOR_FSI; /* to be on the safe side */
601 }
602 /* Resolve direction of still unresolved open FSI sequences */
603 while(stackLast>=0) {
604 if(state==SEEKING_STRONG_FOR_FSI) {
605 /* no need for next statement, already set by default */
606 /* dirProps[isolateStartStack[stackLast]]=LRI; */
607 flags|=DIRPROP_FLAG(LRI);
608 break;
609 }
610 state=previousStateStack[stackLast];
611 stackLast--;
612 }
613 /* When streaming, ignore text after the last paragraph separator */
614 if(pBiDi->reorderingOptions & UBIDI_OPTION_STREAMING) {
615 if(pBiDi->length<originalLength)
616 pBiDi->paraCount--;
617 } else {
618 pBiDi->paras[pBiDi->paraCount-1].limit=originalLength;
619 pBiDi->controlCount=controlCount;
620 }
621 /* For inverse bidi, default para direction is RTL if there is
622 a strong R or AL at either end of the paragraph */
623 if(isDefaultLevelInverse && lastStrong==R) {
624 pBiDi->paras[pBiDi->paraCount-1].level=1;
625 }
626 if(isDefaultLevel) {
627 pBiDi->paraLevel=static_cast<UBiDiLevel>(pBiDi->paras[0].level);
628 }
629 /* The following is needed to resolve the text direction for default level
630 paragraphs containing no strong character */
631 for(i=0; i<pBiDi->paraCount; i++)
632 flags|=DIRPROP_FLAG_LR(pBiDi->paras[i].level);
633
634 if(pBiDi->orderParagraphsLTR && (flags&DIRPROP_FLAG(B))) {
635 flags|=DIRPROP_FLAG(L);
636 }
637 pBiDi->flags=flags;
638 pBiDi->lastArabicPos=lastArabicPos;
639 return TRUE;
640 }
641
642 /* determine the paragraph level at position index */
643 U_CFUNC UBiDiLevel
ubidi_getParaLevelAtIndex(const UBiDi * pBiDi,int32_t pindex)644 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t pindex) {
645 int32_t i;
646 for(i=0; i<pBiDi->paraCount; i++)
647 if(pindex<pBiDi->paras[i].limit)
648 break;
649 if(i>=pBiDi->paraCount)
650 i=pBiDi->paraCount-1;
651 return (UBiDiLevel)(pBiDi->paras[i].level);
652 }
653
654 /* Functions for handling paired brackets ----------------------------------- */
655
656 /* In the isoRuns array, the first entry is used for text outside of any
657 isolate sequence. Higher entries are used for each more deeply nested
658 isolate sequence. isoRunLast is the index of the last used entry. The
659 openings array is used to note the data of opening brackets not yet
660 matched by a closing bracket, or matched but still susceptible to change
661 level.
662 Each isoRun entry contains the index of the first and
663 one-after-last openings entries for pending opening brackets it
664 contains. The next openings entry to use is the one-after-last of the
665 most deeply nested isoRun entry.
666 isoRun entries also contain their current embedding level and the last
667 encountered strong character, since these will be needed to resolve
668 the level of paired brackets. */
669
670 static void
bracketInit(UBiDi * pBiDi,BracketData * bd)671 bracketInit(UBiDi *pBiDi, BracketData *bd) {
672 bd->pBiDi=pBiDi;
673 bd->isoRunLast=0;
674 bd->isoRuns[0].start=0;
675 bd->isoRuns[0].limit=0;
676 bd->isoRuns[0].level=GET_PARALEVEL(pBiDi, 0);
677 UBiDiLevel t = GET_PARALEVEL(pBiDi, 0) & 1;
678 bd->isoRuns[0].lastStrong = bd->isoRuns[0].lastBase = t;
679 bd->isoRuns[0].contextDir = (UBiDiDirection)t;
680 bd->isoRuns[0].contextPos=0;
681 if(pBiDi->openingsMemory) {
682 bd->openings=pBiDi->openingsMemory;
683 bd->openingsCount=pBiDi->openingsSize / sizeof(Opening);
684 } else {
685 bd->openings=bd->simpleOpenings;
686 bd->openingsCount=SIMPLE_OPENINGS_COUNT;
687 }
688 bd->isNumbersSpecial=bd->pBiDi->reorderingMode==UBIDI_REORDER_NUMBERS_SPECIAL ||
689 bd->pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL;
690 }
691
692 /* paragraph boundary */
693 static void
bracketProcessB(BracketData * bd,UBiDiLevel level)694 bracketProcessB(BracketData *bd, UBiDiLevel level) {
695 bd->isoRunLast=0;
696 bd->isoRuns[0].limit=0;
697 bd->isoRuns[0].level=level;
698 bd->isoRuns[0].lastStrong=bd->isoRuns[0].lastBase=level&1;
699 bd->isoRuns[0].contextDir=(UBiDiDirection)(level&1);
700 bd->isoRuns[0].contextPos=0;
701 }
702
703 /* LRE, LRO, RLE, RLO, PDF */
704 static void
bracketProcessBoundary(BracketData * bd,int32_t lastCcPos,UBiDiLevel contextLevel,UBiDiLevel embeddingLevel)705 bracketProcessBoundary(BracketData *bd, int32_t lastCcPos,
706 UBiDiLevel contextLevel, UBiDiLevel embeddingLevel) {
707 IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
708 DirProp *dirProps=bd->pBiDi->dirProps;
709 if(DIRPROP_FLAG(dirProps[lastCcPos])&MASK_ISO) /* after an isolate */
710 return;
711 if(NO_OVERRIDE(embeddingLevel)>NO_OVERRIDE(contextLevel)) /* not a PDF */
712 contextLevel=embeddingLevel;
713 pLastIsoRun->limit=pLastIsoRun->start;
714 pLastIsoRun->level=embeddingLevel;
715 pLastIsoRun->lastStrong=pLastIsoRun->lastBase=contextLevel&1;
716 pLastIsoRun->contextDir=(UBiDiDirection)(contextLevel&1);
717 pLastIsoRun->contextPos=(UBiDiDirection)lastCcPos;
718 }
719
720 /* LRI or RLI */
721 static void
bracketProcessLRI_RLI(BracketData * bd,UBiDiLevel level)722 bracketProcessLRI_RLI(BracketData *bd, UBiDiLevel level) {
723 IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
724 int16_t lastLimit;
725 pLastIsoRun->lastBase=ON;
726 lastLimit=pLastIsoRun->limit;
727 bd->isoRunLast++;
728 pLastIsoRun++;
729 pLastIsoRun->start=pLastIsoRun->limit=lastLimit;
730 pLastIsoRun->level=level;
731 pLastIsoRun->lastStrong=pLastIsoRun->lastBase=level&1;
732 pLastIsoRun->contextDir=(UBiDiDirection)(level&1);
733 pLastIsoRun->contextPos=0;
734 }
735
736 /* PDI */
737 static void
bracketProcessPDI(BracketData * bd)738 bracketProcessPDI(BracketData *bd) {
739 IsoRun *pLastIsoRun;
740 bd->isoRunLast--;
741 pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
742 pLastIsoRun->lastBase=ON;
743 }
744
745 /* newly found opening bracket: create an openings entry */
746 static UBool /* return TRUE if success */
bracketAddOpening(BracketData * bd,UChar match,int32_t position)747 bracketAddOpening(BracketData *bd, UChar match, int32_t position) {
748 IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
749 Opening *pOpening;
750 if(pLastIsoRun->limit>=bd->openingsCount) { /* no available new entry */
751 UBiDi *pBiDi=bd->pBiDi;
752 if(!getInitialOpeningsMemory(pBiDi, pLastIsoRun->limit * 2))
753 return FALSE;
754 if(bd->openings==bd->simpleOpenings)
755 uprv_memcpy(pBiDi->openingsMemory, bd->simpleOpenings,
756 SIMPLE_OPENINGS_COUNT * sizeof(Opening));
757 bd->openings=pBiDi->openingsMemory; /* may have changed */
758 bd->openingsCount=pBiDi->openingsSize / sizeof(Opening);
759 }
760 pOpening=&bd->openings[pLastIsoRun->limit];
761 pOpening->position=position;
762 pOpening->match=match;
763 pOpening->contextDir=pLastIsoRun->contextDir;
764 pOpening->contextPos=pLastIsoRun->contextPos;
765 pOpening->flags=0;
766 pLastIsoRun->limit++;
767 return TRUE;
768 }
769
770 /* change N0c1 to N0c2 when a preceding bracket is assigned the embedding level */
771 static void
fixN0c(BracketData * bd,int32_t openingIndex,int32_t newPropPosition,DirProp newProp)772 fixN0c(BracketData *bd, int32_t openingIndex, int32_t newPropPosition, DirProp newProp) {
773 /* This function calls itself recursively */
774 IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
775 Opening *qOpening;
776 DirProp *dirProps=bd->pBiDi->dirProps;
777 int32_t k, openingPosition, closingPosition;
778 for(k=openingIndex+1, qOpening=&bd->openings[k]; k<pLastIsoRun->limit; k++, qOpening++) {
779 if(qOpening->match>=0) /* not an N0c match */
780 continue;
781 if(newPropPosition<qOpening->contextPos)
782 break;
783 if(newPropPosition>=qOpening->position)
784 continue;
785 if(newProp==qOpening->contextDir)
786 break;
787 openingPosition=qOpening->position;
788 dirProps[openingPosition]=newProp;
789 closingPosition=-(qOpening->match);
790 dirProps[closingPosition]=newProp;
791 qOpening->match=0; /* prevent further changes */
792 fixN0c(bd, k, openingPosition, newProp);
793 fixN0c(bd, k, closingPosition, newProp);
794 }
795 }
796
797 /* process closing bracket */
798 static DirProp /* return L or R if N0b or N0c, ON if N0d */
bracketProcessClosing(BracketData * bd,int32_t openIdx,int32_t position)799 bracketProcessClosing(BracketData *bd, int32_t openIdx, int32_t position) {
800 IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
801 Opening *pOpening, *qOpening;
802 UBiDiDirection direction;
803 UBool stable;
804 DirProp newProp;
805 pOpening=&bd->openings[openIdx];
806 direction=(UBiDiDirection)(pLastIsoRun->level&1);
807 stable=TRUE; /* assume stable until proved otherwise */
808
809 /* The stable flag is set when brackets are paired and their
810 level is resolved and cannot be changed by what will be
811 found later in the source string.
812 An unstable match can occur only when applying N0c, where
813 the resolved level depends on the preceding context, and
814 this context may be affected by text occurring later.
815 Example: RTL paragraph containing: abc[(latin) HEBREW]
816 When the closing parenthesis is encountered, it appears
817 that N0c1 must be applied since 'abc' sets an opposite
818 direction context and both parentheses receive level 2.
819 However, when the closing square bracket is processed,
820 N0b applies because of 'HEBREW' being included within the
821 brackets, thus the square brackets are treated like R and
822 receive level 1. However, this changes the preceding
823 context of the opening parenthesis, and it now appears
824 that N0c2 must be applied to the parentheses rather than
825 N0c1. */
826
827 if((direction==0 && pOpening->flags&FOUND_L) ||
828 (direction==1 && pOpening->flags&FOUND_R)) { /* N0b */
829 newProp=static_cast<DirProp>(direction);
830 }
831 else if(pOpening->flags&(FOUND_L|FOUND_R)) { /* N0c */
832 /* it is stable if there is no containing pair or in
833 conditions too complicated and not worth checking */
834 stable=(openIdx==pLastIsoRun->start);
835 if(direction!=pOpening->contextDir)
836 newProp= static_cast<DirProp>(pOpening->contextDir); /* N0c1 */
837 else
838 newProp= static_cast<DirProp>(direction); /* N0c2 */
839 } else {
840 /* forget this and any brackets nested within this pair */
841 pLastIsoRun->limit= static_cast<uint16_t>(openIdx);
842 return ON; /* N0d */
843 }
844 bd->pBiDi->dirProps[pOpening->position]=newProp;
845 bd->pBiDi->dirProps[position]=newProp;
846 /* Update nested N0c pairs that may be affected */
847 fixN0c(bd, openIdx, pOpening->position, newProp);
848 if(stable) {
849 pLastIsoRun->limit= static_cast<uint16_t>(openIdx); /* forget any brackets nested within this pair */
850 /* remove lower located synonyms if any */
851 while(pLastIsoRun->limit>pLastIsoRun->start &&
852 bd->openings[pLastIsoRun->limit-1].position==pOpening->position)
853 pLastIsoRun->limit--;
854 } else {
855 int32_t k;
856 pOpening->match=-position;
857 /* neutralize lower located synonyms if any */
858 k=openIdx-1;
859 while(k>=pLastIsoRun->start &&
860 bd->openings[k].position==pOpening->position)
861 bd->openings[k--].match=0;
862 /* neutralize any unmatched opening between the current pair;
863 this will also neutralize higher located synonyms if any */
864 for(k=openIdx+1; k<pLastIsoRun->limit; k++) {
865 qOpening=&bd->openings[k];
866 if(qOpening->position>=position)
867 break;
868 if(qOpening->match>0)
869 qOpening->match=0;
870 }
871 }
872 return newProp;
873 }
874
875 /* handle strong characters, digits and candidates for closing brackets */
876 static UBool /* return TRUE if success */
bracketProcessChar(BracketData * bd,int32_t position)877 bracketProcessChar(BracketData *bd, int32_t position) {
878 IsoRun *pLastIsoRun=&bd->isoRuns[bd->isoRunLast];
879 DirProp *dirProps, dirProp, newProp;
880 UBiDiLevel level;
881 dirProps=bd->pBiDi->dirProps;
882 dirProp=dirProps[position];
883 if(dirProp==ON) {
884 UChar c, match;
885 int32_t idx;
886 /* First see if it is a matching closing bracket. Hopefully, this is
887 more efficient than checking if it is a closing bracket at all */
888 c=bd->pBiDi->text[position];
889 for(idx=pLastIsoRun->limit-1; idx>=pLastIsoRun->start; idx--) {
890 if(bd->openings[idx].match!=c)
891 continue;
892 /* We have a match */
893 newProp=bracketProcessClosing(bd, idx, position);
894 if(newProp==ON) { /* N0d */
895 c=0; /* prevent handling as an opening */
896 break;
897 }
898 pLastIsoRun->lastBase=ON;
899 pLastIsoRun->contextDir=(UBiDiDirection)newProp;
900 pLastIsoRun->contextPos=position;
901 level=bd->pBiDi->levels[position];
902 if(level&UBIDI_LEVEL_OVERRIDE) { /* X4, X5 */
903 uint16_t flag;
904 int32_t i;
905 newProp=level&1;
906 pLastIsoRun->lastStrong=newProp;
907 flag=DIRPROP_FLAG(newProp);
908 for(i=pLastIsoRun->start; i<idx; i++)
909 bd->openings[i].flags|=flag;
910 /* matching brackets are not overridden by LRO/RLO */
911 bd->pBiDi->levels[position]&=~UBIDI_LEVEL_OVERRIDE;
912 }
913 /* matching brackets are not overridden by LRO/RLO */
914 bd->pBiDi->levels[bd->openings[idx].position]&=~UBIDI_LEVEL_OVERRIDE;
915 return TRUE;
916 }
917 /* We get here only if the ON character is not a matching closing
918 bracket or it is a case of N0d */
919 /* Now see if it is an opening bracket */
920 if(c)
921 match= static_cast<UChar>(u_getBidiPairedBracket(c)); /* get the matching char */
922 else
923 match=0;
924 if(match!=c && /* has a matching char */
925 ubidi_getPairedBracketType(c)==U_BPT_OPEN) { /* opening bracket */
926 /* special case: process synonyms
927 create an opening entry for each synonym */
928 if(match==0x232A) { /* RIGHT-POINTING ANGLE BRACKET */
929 if(!bracketAddOpening(bd, 0x3009, position))
930 return FALSE;
931 }
932 else if(match==0x3009) { /* RIGHT ANGLE BRACKET */
933 if(!bracketAddOpening(bd, 0x232A, position))
934 return FALSE;
935 }
936 if(!bracketAddOpening(bd, match, position))
937 return FALSE;
938 }
939 }
940 level=bd->pBiDi->levels[position];
941 if(level&UBIDI_LEVEL_OVERRIDE) { /* X4, X5 */
942 newProp=level&1;
943 if(dirProp!=S && dirProp!=WS && dirProp!=ON)
944 dirProps[position]=newProp;
945 pLastIsoRun->lastBase=newProp;
946 pLastIsoRun->lastStrong=newProp;
947 pLastIsoRun->contextDir=(UBiDiDirection)newProp;
948 pLastIsoRun->contextPos=position;
949 }
950 else if(dirProp<=R || dirProp==AL) {
951 newProp= static_cast<DirProp>(DIR_FROM_STRONG(dirProp));
952 pLastIsoRun->lastBase=dirProp;
953 pLastIsoRun->lastStrong=dirProp;
954 pLastIsoRun->contextDir=(UBiDiDirection)newProp;
955 pLastIsoRun->contextPos=position;
956 }
957 else if(dirProp==EN) {
958 pLastIsoRun->lastBase=EN;
959 if(pLastIsoRun->lastStrong==L) {
960 newProp=L; /* W7 */
961 if(!bd->isNumbersSpecial)
962 dirProps[position]=ENL;
963 pLastIsoRun->contextDir=(UBiDiDirection)L;
964 pLastIsoRun->contextPos=position;
965 }
966 else {
967 newProp=R; /* N0 */
968 if(pLastIsoRun->lastStrong==AL)
969 dirProps[position]=AN; /* W2 */
970 else
971 dirProps[position]=ENR;
972 pLastIsoRun->contextDir=(UBiDiDirection)R;
973 pLastIsoRun->contextPos=position;
974 }
975 }
976 else if(dirProp==AN) {
977 newProp=R; /* N0 */
978 pLastIsoRun->lastBase=AN;
979 pLastIsoRun->contextDir=(UBiDiDirection)R;
980 pLastIsoRun->contextPos=position;
981 }
982 else if(dirProp==NSM) {
983 /* if the last real char was ON, change NSM to ON so that it
984 will stay ON even if the last real char is a bracket which
985 may be changed to L or R */
986 newProp=pLastIsoRun->lastBase;
987 if(newProp==ON)
988 dirProps[position]=newProp;
989 }
990 else {
991 newProp=dirProp;
992 pLastIsoRun->lastBase=dirProp;
993 }
994 if(newProp<=R || newProp==AL) {
995 int32_t i;
996 uint16_t flag=DIRPROP_FLAG(DIR_FROM_STRONG(newProp));
997 for(i=pLastIsoRun->start; i<pLastIsoRun->limit; i++)
998 if(position>bd->openings[i].position)
999 bd->openings[i].flags|=flag;
1000 }
1001 return TRUE;
1002 }
1003
1004 /* perform (X1)..(X9) ------------------------------------------------------- */
1005
1006 /* determine if the text is mixed-directional or single-directional */
1007 static UBiDiDirection
directionFromFlags(UBiDi * pBiDi)1008 directionFromFlags(UBiDi *pBiDi) {
1009 Flags flags=pBiDi->flags;
1010 /* if the text contains AN and neutrals, then some neutrals may become RTL */
1011 if(!(flags&MASK_RTL || ((flags&DIRPROP_FLAG(AN)) && (flags&MASK_POSSIBLE_N)))) {
1012 return UBIDI_LTR;
1013 } else if(!(flags&MASK_LTR)) {
1014 return UBIDI_RTL;
1015 } else {
1016 return UBIDI_MIXED;
1017 }
1018 }
1019
1020 /*
1021 * Resolve the explicit levels as specified by explicit embedding codes.
1022 * Recalculate the flags to have them reflect the real properties
1023 * after taking the explicit embeddings into account.
1024 *
1025 * The BiDi algorithm is designed to result in the same behavior whether embedding
1026 * levels are externally specified (from "styled text", supposedly the preferred
1027 * method) or set by explicit embedding codes (LRx, RLx, PDF, FSI, PDI) in the plain text.
1028 * That is why (X9) instructs to remove all not-isolate explicit codes (and BN).
1029 * However, in a real implementation, the removal of these codes and their index
1030 * positions in the plain text is undesirable since it would result in
1031 * reallocated, reindexed text.
1032 * Instead, this implementation leaves the codes in there and just ignores them
1033 * in the subsequent processing.
1034 * In order to get the same reordering behavior, positions with a BN or a not-isolate
1035 * explicit embedding code just get the same level assigned as the last "real"
1036 * character.
1037 *
1038 * Some implementations, not this one, then overwrite some of these
1039 * directionality properties at "real" same-level-run boundaries by
1040 * L or R codes so that the resolution of weak types can be performed on the
1041 * entire paragraph at once instead of having to parse it once more and
1042 * perform that resolution on same-level-runs.
1043 * This limits the scope of the implicit rules in effectively
1044 * the same way as the run limits.
1045 *
1046 * Instead, this implementation does not modify these codes, except for
1047 * paired brackets whose properties (ON) may be replaced by L or R.
1048 * On one hand, the paragraph has to be scanned for same-level-runs, but
1049 * on the other hand, this saves another loop to reset these codes,
1050 * or saves making and modifying a copy of dirProps[].
1051 *
1052 *
1053 * Note that (Pn) and (Xn) changed significantly from version 4 of the BiDi algorithm.
1054 *
1055 *
1056 * Handling the stack of explicit levels (Xn):
1057 *
1058 * With the BiDi stack of explicit levels, as pushed with each
1059 * LRE, RLE, LRO, RLO, LRI, RLI and FSI and popped with each PDF and PDI,
1060 * the explicit level must never exceed UBIDI_MAX_EXPLICIT_LEVEL.
1061 *
1062 * In order to have a correct push-pop semantics even in the case of overflows,
1063 * overflow counters and a valid isolate counter are used as described in UAX#9
1064 * section 3.3.2 "Explicit Levels and Directions".
1065 *
1066 * This implementation assumes that UBIDI_MAX_EXPLICIT_LEVEL is odd.
1067 *
1068 * Returns normally the direction; -1 if there was a memory shortage
1069 *
1070 */
1071 static UBiDiDirection
resolveExplicitLevels(UBiDi * pBiDi,UErrorCode * pErrorCode)1072 resolveExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) {
1073 DirProp *dirProps=pBiDi->dirProps;
1074 UBiDiLevel *levels=pBiDi->levels;
1075 const UChar *text=pBiDi->text;
1076
1077 int32_t i=0, length=pBiDi->length;
1078 Flags flags=pBiDi->flags; /* collect all directionalities in the text */
1079 DirProp dirProp;
1080 UBiDiLevel level=GET_PARALEVEL(pBiDi, 0);
1081 UBiDiDirection direction;
1082 pBiDi->isolateCount=0;
1083
1084 if(U_FAILURE(*pErrorCode)) { return UBIDI_LTR; }
1085
1086 /* determine if the text is mixed-directional or single-directional */
1087 direction=directionFromFlags(pBiDi);
1088
1089 /* we may not need to resolve any explicit levels */
1090 if((direction!=UBIDI_MIXED)) {
1091 /* not mixed directionality: levels don't matter - trailingWSStart will be 0 */
1092 return direction;
1093 }
1094 if(pBiDi->reorderingMode > UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL) {
1095 /* inverse BiDi: mixed, but all characters are at the same embedding level */
1096 /* set all levels to the paragraph level */
1097 int32_t paraIndex, start, limit;
1098 for(paraIndex=0; paraIndex<pBiDi->paraCount; paraIndex++) {
1099 if(paraIndex==0)
1100 start=0;
1101 else
1102 start=pBiDi->paras[paraIndex-1].limit;
1103 limit=pBiDi->paras[paraIndex].limit;
1104 level= static_cast<UBiDiLevel>(pBiDi->paras[paraIndex].level);
1105 for(i=start; i<limit; i++)
1106 levels[i]=level;
1107 }
1108 return direction; /* no bracket matching for inverse BiDi */
1109 }
1110 if(!(flags&(MASK_EXPLICIT|MASK_ISO))) {
1111 /* no embeddings, set all levels to the paragraph level */
1112 /* we still have to perform bracket matching */
1113 int32_t paraIndex, start, limit;
1114 BracketData bracketData;
1115 bracketInit(pBiDi, &bracketData);
1116 for(paraIndex=0; paraIndex<pBiDi->paraCount; paraIndex++) {
1117 if(paraIndex==0)
1118 start=0;
1119 else
1120 start=pBiDi->paras[paraIndex-1].limit;
1121 limit=pBiDi->paras[paraIndex].limit;
1122 level= static_cast<UBiDiLevel>(pBiDi->paras[paraIndex].level);
1123 for(i=start; i<limit; i++) {
1124 levels[i]=level;
1125 dirProp=dirProps[i];
1126 if(dirProp==BN)
1127 continue;
1128 if(dirProp==B) {
1129 if((i+1)<length) {
1130 if(text[i]==CR && text[i+1]==LF)
1131 continue; /* skip CR when followed by LF */
1132 bracketProcessB(&bracketData, level);
1133 }
1134 continue;
1135 }
1136 if(!bracketProcessChar(&bracketData, i)) {
1137 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1138 return UBIDI_LTR;
1139 }
1140 }
1141 }
1142 return direction;
1143 }
1144 {
1145 /* continue to perform (Xn) */
1146
1147 /* (X1) level is set for all codes, embeddingLevel keeps track of the push/pop operations */
1148 /* both variables may carry the UBIDI_LEVEL_OVERRIDE flag to indicate the override status */
1149 UBiDiLevel embeddingLevel=level, newLevel;
1150 UBiDiLevel previousLevel=level; /* previous level for regular (not CC) characters */
1151 int32_t lastCcPos=0; /* index of last effective LRx,RLx, PDx */
1152
1153 /* The following stack remembers the embedding level and the ISOLATE flag of level runs.
1154 stackLast points to its current entry. */
1155 uint16_t stack[UBIDI_MAX_EXPLICIT_LEVEL+2]; /* we never push anything >=UBIDI_MAX_EXPLICIT_LEVEL
1156 but we need one more entry as base */
1157 uint32_t stackLast=0;
1158 int32_t overflowIsolateCount=0;
1159 int32_t overflowEmbeddingCount=0;
1160 int32_t validIsolateCount=0;
1161 BracketData bracketData;
1162 bracketInit(pBiDi, &bracketData);
1163 stack[0]=level; /* initialize base entry to para level, no override, no isolate */
1164
1165 /* recalculate the flags */
1166 flags=0;
1167
1168 for(i=0; i<length; ++i) {
1169 dirProp=dirProps[i];
1170 switch(dirProp) {
1171 case LRE:
1172 case RLE:
1173 case LRO:
1174 case RLO:
1175 /* (X2, X3, X4, X5) */
1176 flags|=DIRPROP_FLAG(BN);
1177 levels[i]=previousLevel;
1178 if (dirProp==LRE || dirProp==LRO)
1179 /* least greater even level */
1180 newLevel=(UBiDiLevel)((embeddingLevel+2)&~(UBIDI_LEVEL_OVERRIDE|1));
1181 else
1182 /* least greater odd level */
1183 newLevel=(UBiDiLevel)((NO_OVERRIDE(embeddingLevel)+1)|1);
1184 if(newLevel<=UBIDI_MAX_EXPLICIT_LEVEL && overflowIsolateCount==0 &&
1185 overflowEmbeddingCount==0) {
1186 lastCcPos=i;
1187 embeddingLevel=newLevel;
1188 if(dirProp==LRO || dirProp==RLO)
1189 embeddingLevel|=UBIDI_LEVEL_OVERRIDE;
1190 stackLast++;
1191 stack[stackLast]=embeddingLevel;
1192 /* we don't need to set UBIDI_LEVEL_OVERRIDE off for LRE and RLE
1193 since this has already been done for newLevel which is
1194 the source for embeddingLevel.
1195 */
1196 } else {
1197 if(overflowIsolateCount==0)
1198 overflowEmbeddingCount++;
1199 }
1200 break;
1201 case PDF:
1202 /* (X7) */
1203 flags|=DIRPROP_FLAG(BN);
1204 levels[i]=previousLevel;
1205 /* handle all the overflow cases first */
1206 if(overflowIsolateCount) {
1207 break;
1208 }
1209 if(overflowEmbeddingCount) {
1210 overflowEmbeddingCount--;
1211 break;
1212 }
1213 if(stackLast>0 && stack[stackLast]<ISOLATE) { /* not an isolate entry */
1214 lastCcPos=i;
1215 stackLast--;
1216 embeddingLevel=(UBiDiLevel)stack[stackLast];
1217 }
1218 break;
1219 case LRI:
1220 case RLI:
1221 flags|=(DIRPROP_FLAG(ON)|DIRPROP_FLAG_LR(embeddingLevel));
1222 levels[i]=NO_OVERRIDE(embeddingLevel);
1223 if(NO_OVERRIDE(embeddingLevel)!=NO_OVERRIDE(previousLevel)) {
1224 bracketProcessBoundary(&bracketData, lastCcPos,
1225 previousLevel, embeddingLevel);
1226 flags|=DIRPROP_FLAG_MULTI_RUNS;
1227 }
1228 previousLevel=embeddingLevel;
1229 /* (X5a, X5b) */
1230 if(dirProp==LRI)
1231 /* least greater even level */
1232 newLevel=(UBiDiLevel)((embeddingLevel+2)&~(UBIDI_LEVEL_OVERRIDE|1));
1233 else
1234 /* least greater odd level */
1235 newLevel=(UBiDiLevel)((NO_OVERRIDE(embeddingLevel)+1)|1);
1236 if(newLevel<=UBIDI_MAX_EXPLICIT_LEVEL && overflowIsolateCount==0 &&
1237 overflowEmbeddingCount==0) {
1238 flags|=DIRPROP_FLAG(dirProp);
1239 lastCcPos=i;
1240 validIsolateCount++;
1241 if(validIsolateCount>pBiDi->isolateCount)
1242 pBiDi->isolateCount=validIsolateCount;
1243 embeddingLevel=newLevel;
1244 /* we can increment stackLast without checking because newLevel
1245 will exceed UBIDI_MAX_EXPLICIT_LEVEL before stackLast overflows */
1246 stackLast++;
1247 stack[stackLast]=embeddingLevel+ISOLATE;
1248 bracketProcessLRI_RLI(&bracketData, embeddingLevel);
1249 } else {
1250 /* make it WS so that it is handled by adjustWSLevels() */
1251 dirProps[i]=WS;
1252 overflowIsolateCount++;
1253 }
1254 break;
1255 case PDI:
1256 if(NO_OVERRIDE(embeddingLevel)!=NO_OVERRIDE(previousLevel)) {
1257 bracketProcessBoundary(&bracketData, lastCcPos,
1258 previousLevel, embeddingLevel);
1259 flags|=DIRPROP_FLAG_MULTI_RUNS;
1260 }
1261 /* (X6a) */
1262 if(overflowIsolateCount) {
1263 overflowIsolateCount--;
1264 /* make it WS so that it is handled by adjustWSLevels() */
1265 dirProps[i]=WS;
1266 }
1267 else if(validIsolateCount) {
1268 flags|=DIRPROP_FLAG(PDI);
1269 lastCcPos=i;
1270 overflowEmbeddingCount=0;
1271 while(stack[stackLast]<ISOLATE) /* pop embedding entries */
1272 stackLast--; /* until the last isolate entry */
1273 stackLast--; /* pop also the last isolate entry */
1274 validIsolateCount--;
1275 bracketProcessPDI(&bracketData);
1276 } else
1277 /* make it WS so that it is handled by adjustWSLevels() */
1278 dirProps[i]=WS;
1279 embeddingLevel=(UBiDiLevel)stack[stackLast]&~ISOLATE;
1280 flags|=(DIRPROP_FLAG(ON)|DIRPROP_FLAG_LR(embeddingLevel));
1281 previousLevel=embeddingLevel;
1282 levels[i]=NO_OVERRIDE(embeddingLevel);
1283 break;
1284 case B:
1285 flags|=DIRPROP_FLAG(B);
1286 levels[i]=GET_PARALEVEL(pBiDi, i);
1287 if((i+1)<length) {
1288 if(text[i]==CR && text[i+1]==LF)
1289 break; /* skip CR when followed by LF */
1290 overflowEmbeddingCount=overflowIsolateCount=0;
1291 validIsolateCount=0;
1292 stackLast=0;
1293 previousLevel=embeddingLevel=GET_PARALEVEL(pBiDi, i+1);
1294 stack[0]=embeddingLevel; /* initialize base entry to para level, no override, no isolate */
1295 bracketProcessB(&bracketData, embeddingLevel);
1296 }
1297 break;
1298 case BN:
1299 /* BN, LRE, RLE, and PDF are supposed to be removed (X9) */
1300 /* they will get their levels set correctly in adjustWSLevels() */
1301 levels[i]=previousLevel;
1302 flags|=DIRPROP_FLAG(BN);
1303 break;
1304 default:
1305 /* all other types are normal characters and get the "real" level */
1306 if(NO_OVERRIDE(embeddingLevel)!=NO_OVERRIDE(previousLevel)) {
1307 bracketProcessBoundary(&bracketData, lastCcPos,
1308 previousLevel, embeddingLevel);
1309 flags|=DIRPROP_FLAG_MULTI_RUNS;
1310 if(embeddingLevel&UBIDI_LEVEL_OVERRIDE)
1311 flags|=DIRPROP_FLAG_O(embeddingLevel);
1312 else
1313 flags|=DIRPROP_FLAG_E(embeddingLevel);
1314 }
1315 previousLevel=embeddingLevel;
1316 levels[i]=embeddingLevel;
1317 if(!bracketProcessChar(&bracketData, i))
1318 return (UBiDiDirection)-1;
1319 /* the dirProp may have been changed in bracketProcessChar() */
1320 flags|=DIRPROP_FLAG(dirProps[i]);
1321 break;
1322 }
1323 }
1324 if(flags&MASK_EMBEDDING)
1325 flags|=DIRPROP_FLAG_LR(pBiDi->paraLevel);
1326 if(pBiDi->orderParagraphsLTR && (flags&DIRPROP_FLAG(B)))
1327 flags|=DIRPROP_FLAG(L);
1328 /* again, determine if the text is mixed-directional or single-directional */
1329 pBiDi->flags=flags;
1330 direction=directionFromFlags(pBiDi);
1331 }
1332 return direction;
1333 }
1334
1335 /*
1336 * Use a pre-specified embedding levels array:
1337 *
1338 * Adjust the directional properties for overrides (->LEVEL_OVERRIDE),
1339 * ignore all explicit codes (X9),
1340 * and check all the preset levels.
1341 *
1342 * Recalculate the flags to have them reflect the real properties
1343 * after taking the explicit embeddings into account.
1344 */
1345 static UBiDiDirection
checkExplicitLevels(UBiDi * pBiDi,UErrorCode * pErrorCode)1346 checkExplicitLevels(UBiDi *pBiDi, UErrorCode *pErrorCode) {
1347 DirProp *dirProps=pBiDi->dirProps;
1348 UBiDiLevel *levels=pBiDi->levels;
1349 int32_t isolateCount=0;
1350
1351 int32_t length=pBiDi->length;
1352 Flags flags=0; /* collect all directionalities in the text */
1353 pBiDi->isolateCount=0;
1354
1355 int32_t currentParaIndex = 0;
1356 int32_t currentParaLimit = pBiDi->paras[0].limit;
1357 int32_t currentParaLevel = pBiDi->paraLevel;
1358
1359 for(int32_t i=0; i<length; ++i) {
1360 UBiDiLevel level=levels[i];
1361 DirProp dirProp=dirProps[i];
1362 if(dirProp==LRI || dirProp==RLI) {
1363 isolateCount++;
1364 if(isolateCount>pBiDi->isolateCount)
1365 pBiDi->isolateCount=isolateCount;
1366 }
1367 else if(dirProp==PDI)
1368 isolateCount--;
1369 else if(dirProp==B)
1370 isolateCount=0;
1371
1372 // optimized version of int32_t currentParaLevel = GET_PARALEVEL(pBiDi, i);
1373 if (pBiDi->defaultParaLevel != 0 &&
1374 i == currentParaLimit && (currentParaIndex + 1) < pBiDi->paraCount) {
1375 currentParaLevel = pBiDi->paras[++currentParaIndex].level;
1376 currentParaLimit = pBiDi->paras[currentParaIndex].limit;
1377 }
1378
1379 UBiDiLevel overrideFlag = level & UBIDI_LEVEL_OVERRIDE;
1380 level &= ~UBIDI_LEVEL_OVERRIDE;
1381 if (level < currentParaLevel || UBIDI_MAX_EXPLICIT_LEVEL < level) {
1382 if (level == 0) {
1383 if (dirProp == B) {
1384 // Paragraph separators are ok with explicit level 0.
1385 // Prevents reordering of paragraphs.
1386 } else {
1387 // Treat explicit level 0 as a wildcard for the paragraph level.
1388 // Avoid making the caller guess what the paragraph level would be.
1389 level = (UBiDiLevel)currentParaLevel;
1390 levels[i] = level | overrideFlag;
1391 }
1392 } else {
1393 // 1 <= level < currentParaLevel or UBIDI_MAX_EXPLICIT_LEVEL < level
1394 /* level out of bounds */
1395 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1396 return UBIDI_LTR;
1397 }
1398 }
1399 if (overrideFlag != 0) {
1400 /* keep the override flag in levels[i] but adjust the flags */
1401 flags|=DIRPROP_FLAG_O(level);
1402 } else {
1403 /* set the flags */
1404 flags|=DIRPROP_FLAG_E(level)|DIRPROP_FLAG(dirProp);
1405 }
1406 }
1407 if(flags&MASK_EMBEDDING)
1408 flags|=DIRPROP_FLAG_LR(pBiDi->paraLevel);
1409 /* determine if the text is mixed-directional or single-directional */
1410 pBiDi->flags=flags;
1411 return directionFromFlags(pBiDi);
1412 }
1413
1414 /******************************************************************
1415 The Properties state machine table
1416 *******************************************************************
1417
1418 All table cells are 8 bits:
1419 bits 0..4: next state
1420 bits 5..7: action to perform (if > 0)
1421
1422 Cells may be of format "n" where n represents the next state
1423 (except for the rightmost column).
1424 Cells may also be of format "s(x,y)" where x represents an action
1425 to perform and y represents the next state.
1426
1427 *******************************************************************
1428 Definitions and type for properties state table
1429 *******************************************************************
1430 */
1431 #define IMPTABPROPS_COLUMNS 16
1432 #define IMPTABPROPS_RES (IMPTABPROPS_COLUMNS - 1)
1433 #define GET_STATEPROPS(cell) ((cell)&0x1f)
1434 #define GET_ACTIONPROPS(cell) ((cell)>>5)
1435 #define s(action, newState) ((uint8_t)(newState+(action<<5)))
1436
1437 static const uint8_t groupProp[] = /* dirProp regrouped */
1438 {
1439 /* L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN FSI LRI RLI PDI ENL ENR */
1440 0, 1, 2, 7, 8, 3, 9, 6, 5, 4, 4, 10, 10, 12, 10, 10, 10, 11, 10, 4, 4, 4, 4, 13, 14
1441 };
1442 enum { DirProp_L=0, DirProp_R=1, DirProp_EN=2, DirProp_AN=3, DirProp_ON=4, DirProp_S=5, DirProp_B=6 }; /* reduced dirProp */
1443
1444 /******************************************************************
1445
1446 PROPERTIES STATE TABLE
1447
1448 In table impTabProps,
1449 - the ON column regroups ON and WS, FSI, RLI, LRI and PDI
1450 - the BN column regroups BN, LRE, RLE, LRO, RLO, PDF
1451 - the Res column is the reduced property assigned to a run
1452
1453 Action 1: process current run1, init new run1
1454 2: init new run2
1455 3: process run1, process run2, init new run1
1456 4: process run1, set run1=run2, init new run2
1457
1458 Notes:
1459 1) This table is used in resolveImplicitLevels().
1460 2) This table triggers actions when there is a change in the Bidi
1461 property of incoming characters (action 1).
1462 3) Most such property sequences are processed immediately (in
1463 fact, passed to processPropertySeq().
1464 4) However, numbers are assembled as one sequence. This means
1465 that undefined situations (like CS following digits, until
1466 it is known if the next char will be a digit) are held until
1467 following chars define them.
1468 Example: digits followed by CS, then comes another CS or ON;
1469 the digits will be processed, then the CS assigned
1470 as the start of an ON sequence (action 3).
1471 5) There are cases where more than one sequence must be
1472 processed, for instance digits followed by CS followed by L:
1473 the digits must be processed as one sequence, and the CS
1474 must be processed as an ON sequence, all this before starting
1475 assembling chars for the opening L sequence.
1476
1477
1478 */
1479 static const uint8_t impTabProps[][IMPTABPROPS_COLUMNS] =
1480 {
1481 /* L , R , EN , AN , ON , S , B , ES , ET , CS , BN , NSM , AL , ENL , ENR , Res */
1482 /* 0 Init */ { 1 , 2 , 4 , 5 , 7 , 15 , 17 , 7 , 9 , 7 , 0 , 7 , 3 , 18 , 21 , DirProp_ON },
1483 /* 1 L */ { 1 , s(1,2), s(1,4), s(1,5), s(1,7),s(1,15),s(1,17), s(1,7), s(1,9), s(1,7), 1 , 1 , s(1,3),s(1,18),s(1,21), DirProp_L },
1484 /* 2 R */ { s(1,1), 2 , s(1,4), s(1,5), s(1,7),s(1,15),s(1,17), s(1,7), s(1,9), s(1,7), 2 , 2 , s(1,3),s(1,18),s(1,21), DirProp_R },
1485 /* 3 AL */ { s(1,1), s(1,2), s(1,6), s(1,6), s(1,8),s(1,16),s(1,17), s(1,8), s(1,8), s(1,8), 3 , 3 , 3 ,s(1,18),s(1,21), DirProp_R },
1486 /* 4 EN */ { s(1,1), s(1,2), 4 , s(1,5), s(1,7),s(1,15),s(1,17),s(2,10), 11 ,s(2,10), 4 , 4 , s(1,3), 18 , 21 , DirProp_EN },
1487 /* 5 AN */ { s(1,1), s(1,2), s(1,4), 5 , s(1,7),s(1,15),s(1,17), s(1,7), s(1,9),s(2,12), 5 , 5 , s(1,3),s(1,18),s(1,21), DirProp_AN },
1488 /* 6 AL:EN/AN */ { s(1,1), s(1,2), 6 , 6 , s(1,8),s(1,16),s(1,17), s(1,8), s(1,8),s(2,13), 6 , 6 , s(1,3), 18 , 21 , DirProp_AN },
1489 /* 7 ON */ { s(1,1), s(1,2), s(1,4), s(1,5), 7 ,s(1,15),s(1,17), 7 ,s(2,14), 7 , 7 , 7 , s(1,3),s(1,18),s(1,21), DirProp_ON },
1490 /* 8 AL:ON */ { s(1,1), s(1,2), s(1,6), s(1,6), 8 ,s(1,16),s(1,17), 8 , 8 , 8 , 8 , 8 , s(1,3),s(1,18),s(1,21), DirProp_ON },
1491 /* 9 ET */ { s(1,1), s(1,2), 4 , s(1,5), 7 ,s(1,15),s(1,17), 7 , 9 , 7 , 9 , 9 , s(1,3), 18 , 21 , DirProp_ON },
1492 /*10 EN+ES/CS */ { s(3,1), s(3,2), 4 , s(3,5), s(4,7),s(3,15),s(3,17), s(4,7),s(4,14), s(4,7), 10 , s(4,7), s(3,3), 18 , 21 , DirProp_EN },
1493 /*11 EN+ET */ { s(1,1), s(1,2), 4 , s(1,5), s(1,7),s(1,15),s(1,17), s(1,7), 11 , s(1,7), 11 , 11 , s(1,3), 18 , 21 , DirProp_EN },
1494 /*12 AN+CS */ { s(3,1), s(3,2), s(3,4), 5 , s(4,7),s(3,15),s(3,17), s(4,7),s(4,14), s(4,7), 12 , s(4,7), s(3,3),s(3,18),s(3,21), DirProp_AN },
1495 /*13 AL:EN/AN+CS */ { s(3,1), s(3,2), 6 , 6 , s(4,8),s(3,16),s(3,17), s(4,8), s(4,8), s(4,8), 13 , s(4,8), s(3,3), 18 , 21 , DirProp_AN },
1496 /*14 ON+ET */ { s(1,1), s(1,2), s(4,4), s(1,5), 7 ,s(1,15),s(1,17), 7 , 14 , 7 , 14 , 14 , s(1,3),s(4,18),s(4,21), DirProp_ON },
1497 /*15 S */ { s(1,1), s(1,2), s(1,4), s(1,5), s(1,7), 15 ,s(1,17), s(1,7), s(1,9), s(1,7), 15 , s(1,7), s(1,3),s(1,18),s(1,21), DirProp_S },
1498 /*16 AL:S */ { s(1,1), s(1,2), s(1,6), s(1,6), s(1,8), 16 ,s(1,17), s(1,8), s(1,8), s(1,8), 16 , s(1,8), s(1,3),s(1,18),s(1,21), DirProp_S },
1499 /*17 B */ { s(1,1), s(1,2), s(1,4), s(1,5), s(1,7),s(1,15), 17 , s(1,7), s(1,9), s(1,7), 17 , s(1,7), s(1,3),s(1,18),s(1,21), DirProp_B },
1500 /*18 ENL */ { s(1,1), s(1,2), 18 , s(1,5), s(1,7),s(1,15),s(1,17),s(2,19), 20 ,s(2,19), 18 , 18 , s(1,3), 18 , 21 , DirProp_L },
1501 /*19 ENL+ES/CS */ { s(3,1), s(3,2), 18 , s(3,5), s(4,7),s(3,15),s(3,17), s(4,7),s(4,14), s(4,7), 19 , s(4,7), s(3,3), 18 , 21 , DirProp_L },
1502 /*20 ENL+ET */ { s(1,1), s(1,2), 18 , s(1,5), s(1,7),s(1,15),s(1,17), s(1,7), 20 , s(1,7), 20 , 20 , s(1,3), 18 , 21 , DirProp_L },
1503 /*21 ENR */ { s(1,1), s(1,2), 21 , s(1,5), s(1,7),s(1,15),s(1,17),s(2,22), 23 ,s(2,22), 21 , 21 , s(1,3), 18 , 21 , DirProp_AN },
1504 /*22 ENR+ES/CS */ { s(3,1), s(3,2), 21 , s(3,5), s(4,7),s(3,15),s(3,17), s(4,7),s(4,14), s(4,7), 22 , s(4,7), s(3,3), 18 , 21 , DirProp_AN },
1505 /*23 ENR+ET */ { s(1,1), s(1,2), 21 , s(1,5), s(1,7),s(1,15),s(1,17), s(1,7), 23 , s(1,7), 23 , 23 , s(1,3), 18 , 21 , DirProp_AN }
1506 };
1507
1508 /* we must undef macro s because the levels tables have a different
1509 * structure (4 bits for action and 4 bits for next state.
1510 */
1511 #undef s
1512
1513 /******************************************************************
1514 The levels state machine tables
1515 *******************************************************************
1516
1517 All table cells are 8 bits:
1518 bits 0..3: next state
1519 bits 4..7: action to perform (if > 0)
1520
1521 Cells may be of format "n" where n represents the next state
1522 (except for the rightmost column).
1523 Cells may also be of format "s(x,y)" where x represents an action
1524 to perform and y represents the next state.
1525
1526 This format limits each table to 16 states each and to 15 actions.
1527
1528 *******************************************************************
1529 Definitions and type for levels state tables
1530 *******************************************************************
1531 */
1532 #define IMPTABLEVELS_COLUMNS (DirProp_B + 2)
1533 #define IMPTABLEVELS_RES (IMPTABLEVELS_COLUMNS - 1)
1534 #define GET_STATE(cell) ((cell)&0x0f)
1535 #define GET_ACTION(cell) ((cell)>>4)
1536 #define s(action, newState) ((uint8_t)(newState+(action<<4)))
1537
1538 typedef uint8_t ImpTab[][IMPTABLEVELS_COLUMNS];
1539 typedef uint8_t ImpAct[];
1540
1541 /* FOOD FOR THOUGHT: each ImpTab should have its associated ImpAct,
1542 * instead of having a pair of ImpTab and a pair of ImpAct.
1543 */
1544 typedef struct ImpTabPair {
1545 const void * pImpTab[2];
1546 const void * pImpAct[2];
1547 } ImpTabPair;
1548
1549 /******************************************************************
1550
1551 LEVELS STATE TABLES
1552
1553 In all levels state tables,
1554 - state 0 is the initial state
1555 - the Res column is the increment to add to the text level
1556 for this property sequence.
1557
1558 The impAct arrays for each table of a pair map the local action
1559 numbers of the table to the total list of actions. For instance,
1560 action 2 in a given table corresponds to the action number which
1561 appears in entry [2] of the impAct array for that table.
1562 The first entry of all impAct arrays must be 0.
1563
1564 Action 1: init conditional sequence
1565 2: prepend conditional sequence to current sequence
1566 3: set ON sequence to new level - 1
1567 4: init EN/AN/ON sequence
1568 5: fix EN/AN/ON sequence followed by R
1569 6: set previous level sequence to level 2
1570
1571 Notes:
1572 1) These tables are used in processPropertySeq(). The input
1573 is property sequences as determined by resolveImplicitLevels.
1574 2) Most such property sequences are processed immediately
1575 (levels are assigned).
1576 3) However, some sequences cannot be assigned a final level till
1577 one or more following sequences are received. For instance,
1578 ON following an R sequence within an even-level paragraph.
1579 If the following sequence is R, the ON sequence will be
1580 assigned basic run level+1, and so will the R sequence.
1581 4) S is generally handled like ON, since its level will be fixed
1582 to paragraph level in adjustWSLevels().
1583
1584 */
1585
1586 static const ImpTab impTabL_DEFAULT = /* Even paragraph level */
1587 /* In this table, conditional sequences receive the lower possible level
1588 until proven otherwise.
1589 */
1590 {
1591 /* L , R , EN , AN , ON , S , B , Res */
1592 /* 0 : init */ { 0 , 1 , 0 , 2 , 0 , 0 , 0 , 0 },
1593 /* 1 : R */ { 0 , 1 , 3 , 3 , s(1,4), s(1,4), 0 , 1 },
1594 /* 2 : AN */ { 0 , 1 , 0 , 2 , s(1,5), s(1,5), 0 , 2 },
1595 /* 3 : R+EN/AN */ { 0 , 1 , 3 , 3 , s(1,4), s(1,4), 0 , 2 },
1596 /* 4 : R+ON */ { 0 , s(2,1), s(3,3), s(3,3), 4 , 4 , 0 , 0 },
1597 /* 5 : AN+ON */ { 0 , s(2,1), 0 , s(3,2), 5 , 5 , 0 , 0 }
1598 };
1599 static const ImpTab impTabR_DEFAULT = /* Odd paragraph level */
1600 /* In this table, conditional sequences receive the lower possible level
1601 until proven otherwise.
1602 */
1603 {
1604 /* L , R , EN , AN , ON , S , B , Res */
1605 /* 0 : init */ { 1 , 0 , 2 , 2 , 0 , 0 , 0 , 0 },
1606 /* 1 : L */ { 1 , 0 , 1 , 3 , s(1,4), s(1,4), 0 , 1 },
1607 /* 2 : EN/AN */ { 1 , 0 , 2 , 2 , 0 , 0 , 0 , 1 },
1608 /* 3 : L+AN */ { 1 , 0 , 1 , 3 , 5 , 5 , 0 , 1 },
1609 /* 4 : L+ON */ { s(2,1), 0 , s(2,1), 3 , 4 , 4 , 0 , 0 },
1610 /* 5 : L+AN+ON */ { 1 , 0 , 1 , 3 , 5 , 5 , 0 , 0 }
1611 };
1612 static const ImpAct impAct0 = {0,1,2,3,4};
1613 static const ImpTabPair impTab_DEFAULT = {{&impTabL_DEFAULT,
1614 &impTabR_DEFAULT},
1615 {&impAct0, &impAct0}};
1616
1617 static const ImpTab impTabL_NUMBERS_SPECIAL = /* Even paragraph level */
1618 /* In this table, conditional sequences receive the lower possible level
1619 until proven otherwise.
1620 */
1621 {
1622 /* L , R , EN , AN , ON , S , B , Res */
1623 /* 0 : init */ { 0 , 2 , s(1,1), s(1,1), 0 , 0 , 0 , 0 },
1624 /* 1 : L+EN/AN */ { 0 , s(4,2), 1 , 1 , 0 , 0 , 0 , 0 },
1625 /* 2 : R */ { 0 , 2 , 4 , 4 , s(1,3), s(1,3), 0 , 1 },
1626 /* 3 : R+ON */ { 0 , s(2,2), s(3,4), s(3,4), 3 , 3 , 0 , 0 },
1627 /* 4 : R+EN/AN */ { 0 , 2 , 4 , 4 , s(1,3), s(1,3), 0 , 2 }
1628 };
1629 static const ImpTabPair impTab_NUMBERS_SPECIAL = {{&impTabL_NUMBERS_SPECIAL,
1630 &impTabR_DEFAULT},
1631 {&impAct0, &impAct0}};
1632
1633 static const ImpTab impTabL_GROUP_NUMBERS_WITH_R =
1634 /* In this table, EN/AN+ON sequences receive levels as if associated with R
1635 until proven that there is L or sor/eor on both sides. AN is handled like EN.
1636 */
1637 {
1638 /* L , R , EN , AN , ON , S , B , Res */
1639 /* 0 init */ { 0 , 3 , s(1,1), s(1,1), 0 , 0 , 0 , 0 },
1640 /* 1 EN/AN */ { s(2,0), 3 , 1 , 1 , 2 , s(2,0), s(2,0), 2 },
1641 /* 2 EN/AN+ON */ { s(2,0), 3 , 1 , 1 , 2 , s(2,0), s(2,0), 1 },
1642 /* 3 R */ { 0 , 3 , 5 , 5 , s(1,4), 0 , 0 , 1 },
1643 /* 4 R+ON */ { s(2,0), 3 , 5 , 5 , 4 , s(2,0), s(2,0), 1 },
1644 /* 5 R+EN/AN */ { 0 , 3 , 5 , 5 , s(1,4), 0 , 0 , 2 }
1645 };
1646 static const ImpTab impTabR_GROUP_NUMBERS_WITH_R =
1647 /* In this table, EN/AN+ON sequences receive levels as if associated with R
1648 until proven that there is L on both sides. AN is handled like EN.
1649 */
1650 {
1651 /* L , R , EN , AN , ON , S , B , Res */
1652 /* 0 init */ { 2 , 0 , 1 , 1 , 0 , 0 , 0 , 0 },
1653 /* 1 EN/AN */ { 2 , 0 , 1 , 1 , 0 , 0 , 0 , 1 },
1654 /* 2 L */ { 2 , 0 , s(1,4), s(1,4), s(1,3), 0 , 0 , 1 },
1655 /* 3 L+ON */ { s(2,2), 0 , 4 , 4 , 3 , 0 , 0 , 0 },
1656 /* 4 L+EN/AN */ { s(2,2), 0 , 4 , 4 , 3 , 0 , 0 , 1 }
1657 };
1658 static const ImpTabPair impTab_GROUP_NUMBERS_WITH_R = {
1659 {&impTabL_GROUP_NUMBERS_WITH_R,
1660 &impTabR_GROUP_NUMBERS_WITH_R},
1661 {&impAct0, &impAct0}};
1662
1663
1664 static const ImpTab impTabL_INVERSE_NUMBERS_AS_L =
1665 /* This table is identical to the Default LTR table except that EN and AN are
1666 handled like L.
1667 */
1668 {
1669 /* L , R , EN , AN , ON , S , B , Res */
1670 /* 0 : init */ { 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 },
1671 /* 1 : R */ { 0 , 1 , 0 , 0 , s(1,4), s(1,4), 0 , 1 },
1672 /* 2 : AN */ { 0 , 1 , 0 , 0 , s(1,5), s(1,5), 0 , 2 },
1673 /* 3 : R+EN/AN */ { 0 , 1 , 0 , 0 , s(1,4), s(1,4), 0 , 2 },
1674 /* 4 : R+ON */ { s(2,0), 1 , s(2,0), s(2,0), 4 , 4 , s(2,0), 1 },
1675 /* 5 : AN+ON */ { s(2,0), 1 , s(2,0), s(2,0), 5 , 5 , s(2,0), 1 }
1676 };
1677 static const ImpTab impTabR_INVERSE_NUMBERS_AS_L =
1678 /* This table is identical to the Default RTL table except that EN and AN are
1679 handled like L.
1680 */
1681 {
1682 /* L , R , EN , AN , ON , S , B , Res */
1683 /* 0 : init */ { 1 , 0 , 1 , 1 , 0 , 0 , 0 , 0 },
1684 /* 1 : L */ { 1 , 0 , 1 , 1 , s(1,4), s(1,4), 0 , 1 },
1685 /* 2 : EN/AN */ { 1 , 0 , 1 , 1 , 0 , 0 , 0 , 1 },
1686 /* 3 : L+AN */ { 1 , 0 , 1 , 1 , 5 , 5 , 0 , 1 },
1687 /* 4 : L+ON */ { s(2,1), 0 , s(2,1), s(2,1), 4 , 4 , 0 , 0 },
1688 /* 5 : L+AN+ON */ { 1 , 0 , 1 , 1 , 5 , 5 , 0 , 0 }
1689 };
1690 static const ImpTabPair impTab_INVERSE_NUMBERS_AS_L = {
1691 {&impTabL_INVERSE_NUMBERS_AS_L,
1692 &impTabR_INVERSE_NUMBERS_AS_L},
1693 {&impAct0, &impAct0}};
1694
1695 static const ImpTab impTabR_INVERSE_LIKE_DIRECT = /* Odd paragraph level */
1696 /* In this table, conditional sequences receive the lower possible level
1697 until proven otherwise.
1698 */
1699 {
1700 /* L , R , EN , AN , ON , S , B , Res */
1701 /* 0 : init */ { 1 , 0 , 2 , 2 , 0 , 0 , 0 , 0 },
1702 /* 1 : L */ { 1 , 0 , 1 , 2 , s(1,3), s(1,3), 0 , 1 },
1703 /* 2 : EN/AN */ { 1 , 0 , 2 , 2 , 0 , 0 , 0 , 1 },
1704 /* 3 : L+ON */ { s(2,1), s(3,0), 6 , 4 , 3 , 3 , s(3,0), 0 },
1705 /* 4 : L+ON+AN */ { s(2,1), s(3,0), 6 , 4 , 5 , 5 , s(3,0), 3 },
1706 /* 5 : L+AN+ON */ { s(2,1), s(3,0), 6 , 4 , 5 , 5 , s(3,0), 2 },
1707 /* 6 : L+ON+EN */ { s(2,1), s(3,0), 6 , 4 , 3 , 3 , s(3,0), 1 }
1708 };
1709 static const ImpAct impAct1 = {0,1,13,14};
1710 /* FOOD FOR THOUGHT: in LTR table below, check case "JKL 123abc"
1711 */
1712 static const ImpTabPair impTab_INVERSE_LIKE_DIRECT = {
1713 {&impTabL_DEFAULT,
1714 &impTabR_INVERSE_LIKE_DIRECT},
1715 {&impAct0, &impAct1}};
1716
1717 static const ImpTab impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS =
1718 /* The case handled in this table is (visually): R EN L
1719 */
1720 {
1721 /* L , R , EN , AN , ON , S , B , Res */
1722 /* 0 : init */ { 0 , s(6,3), 0 , 1 , 0 , 0 , 0 , 0 },
1723 /* 1 : L+AN */ { 0 , s(6,3), 0 , 1 , s(1,2), s(3,0), 0 , 4 },
1724 /* 2 : L+AN+ON */ { s(2,0), s(6,3), s(2,0), 1 , 2 , s(3,0), s(2,0), 3 },
1725 /* 3 : R */ { 0 , s(6,3), s(5,5), s(5,6), s(1,4), s(3,0), 0 , 3 },
1726 /* 4 : R+ON */ { s(3,0), s(4,3), s(5,5), s(5,6), 4 , s(3,0), s(3,0), 3 },
1727 /* 5 : R+EN */ { s(3,0), s(4,3), 5 , s(5,6), s(1,4), s(3,0), s(3,0), 4 },
1728 /* 6 : R+AN */ { s(3,0), s(4,3), s(5,5), 6 , s(1,4), s(3,0), s(3,0), 4 }
1729 };
1730 static const ImpTab impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS =
1731 /* The cases handled in this table are (visually): R EN L
1732 R L AN L
1733 */
1734 {
1735 /* L , R , EN , AN , ON , S , B , Res */
1736 /* 0 : init */ { s(1,3), 0 , 1 , 1 , 0 , 0 , 0 , 0 },
1737 /* 1 : R+EN/AN */ { s(2,3), 0 , 1 , 1 , 2 , s(4,0), 0 , 1 },
1738 /* 2 : R+EN/AN+ON */ { s(2,3), 0 , 1 , 1 , 2 , s(4,0), 0 , 0 },
1739 /* 3 : L */ { 3 , 0 , 3 , s(3,6), s(1,4), s(4,0), 0 , 1 },
1740 /* 4 : L+ON */ { s(5,3), s(4,0), 5 , s(3,6), 4 , s(4,0), s(4,0), 0 },
1741 /* 5 : L+ON+EN */ { s(5,3), s(4,0), 5 , s(3,6), 4 , s(4,0), s(4,0), 1 },
1742 /* 6 : L+AN */ { s(5,3), s(4,0), 6 , 6 , 4 , s(4,0), s(4,0), 3 }
1743 };
1744 static const ImpAct impAct2 = {0,1,2,5,6,7,8};
1745 static const ImpAct impAct3 = {0,1,9,10,11,12};
1746 static const ImpTabPair impTab_INVERSE_LIKE_DIRECT_WITH_MARKS = {
1747 {&impTabL_INVERSE_LIKE_DIRECT_WITH_MARKS,
1748 &impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS},
1749 {&impAct2, &impAct3}};
1750
1751 static const ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL = {
1752 {&impTabL_NUMBERS_SPECIAL,
1753 &impTabR_INVERSE_LIKE_DIRECT},
1754 {&impAct0, &impAct1}};
1755
1756 static const ImpTab impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS =
1757 /* The case handled in this table is (visually): R EN L
1758 */
1759 {
1760 /* L , R , EN , AN , ON , S , B , Res */
1761 /* 0 : init */ { 0 , s(6,2), 1 , 1 , 0 , 0 , 0 , 0 },
1762 /* 1 : L+EN/AN */ { 0 , s(6,2), 1 , 1 , 0 , s(3,0), 0 , 4 },
1763 /* 2 : R */ { 0 , s(6,2), s(5,4), s(5,4), s(1,3), s(3,0), 0 , 3 },
1764 /* 3 : R+ON */ { s(3,0), s(4,2), s(5,4), s(5,4), 3 , s(3,0), s(3,0), 3 },
1765 /* 4 : R+EN/AN */ { s(3,0), s(4,2), 4 , 4 , s(1,3), s(3,0), s(3,0), 4 }
1766 };
1767 static const ImpTabPair impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS = {
1768 {&impTabL_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS,
1769 &impTabR_INVERSE_LIKE_DIRECT_WITH_MARKS},
1770 {&impAct2, &impAct3}};
1771
1772 #undef s
1773
1774 typedef struct {
1775 const ImpTab * pImpTab; /* level table pointer */
1776 const ImpAct * pImpAct; /* action map array */
1777 int32_t startON; /* start of ON sequence */
1778 int32_t startL2EN; /* start of level 2 sequence */
1779 int32_t lastStrongRTL; /* index of last found R or AL */
1780 int32_t state; /* current state */
1781 int32_t runStart; /* start position of the run */
1782 UBiDiLevel runLevel; /* run level before implicit solving */
1783 } LevState;
1784
1785 /*------------------------------------------------------------------------*/
1786
1787 static void
addPoint(UBiDi * pBiDi,int32_t pos,int32_t flag)1788 addPoint(UBiDi *pBiDi, int32_t pos, int32_t flag)
1789 /* param pos: position where to insert
1790 param flag: one of LRM_BEFORE, LRM_AFTER, RLM_BEFORE, RLM_AFTER
1791 */
1792 {
1793 #define FIRSTALLOC 10
1794 Point point;
1795 InsertPoints * pInsertPoints=&(pBiDi->insertPoints);
1796
1797 if (pInsertPoints->capacity == 0)
1798 {
1799 pInsertPoints->points=static_cast<Point *>(uprv_malloc(sizeof(Point)*FIRSTALLOC));
1800 if (pInsertPoints->points == NULL)
1801 {
1802 pInsertPoints->errorCode=U_MEMORY_ALLOCATION_ERROR;
1803 return;
1804 }
1805 pInsertPoints->capacity=FIRSTALLOC;
1806 }
1807 if (pInsertPoints->size >= pInsertPoints->capacity) /* no room for new point */
1808 {
1809 Point * savePoints=pInsertPoints->points;
1810 pInsertPoints->points=static_cast<Point *>(uprv_realloc(pInsertPoints->points,
1811 pInsertPoints->capacity*2*sizeof(Point)));
1812 if (pInsertPoints->points == NULL)
1813 {
1814 pInsertPoints->points=savePoints;
1815 pInsertPoints->errorCode=U_MEMORY_ALLOCATION_ERROR;
1816 return;
1817 }
1818 else pInsertPoints->capacity*=2;
1819 }
1820 point.pos=pos;
1821 point.flag=flag;
1822 pInsertPoints->points[pInsertPoints->size]=point;
1823 pInsertPoints->size++;
1824 #undef FIRSTALLOC
1825 }
1826
1827 static void
setLevelsOutsideIsolates(UBiDi * pBiDi,int32_t start,int32_t limit,UBiDiLevel level)1828 setLevelsOutsideIsolates(UBiDi *pBiDi, int32_t start, int32_t limit, UBiDiLevel level)
1829 {
1830 DirProp *dirProps=pBiDi->dirProps, dirProp;
1831 UBiDiLevel *levels=pBiDi->levels;
1832 int32_t isolateCount=0, k;
1833 for(k=start; k<limit; k++) {
1834 dirProp=dirProps[k];
1835 if(dirProp==PDI)
1836 isolateCount--;
1837 if(isolateCount==0)
1838 levels[k]=level;
1839 if(dirProp==LRI || dirProp==RLI)
1840 isolateCount++;
1841 }
1842 }
1843
1844 /* perform rules (Wn), (Nn), and (In) on a run of the text ------------------ */
1845
1846 /*
1847 * This implementation of the (Wn) rules applies all rules in one pass.
1848 * In order to do so, it needs a look-ahead of typically 1 character
1849 * (except for W5: sequences of ET) and keeps track of changes
1850 * in a rule Wp that affect a later Wq (p<q).
1851 *
1852 * The (Nn) and (In) rules are also performed in that same single loop,
1853 * but effectively one iteration behind for white space.
1854 *
1855 * Since all implicit rules are performed in one step, it is not necessary
1856 * to actually store the intermediate directional properties in dirProps[].
1857 */
1858
1859 static void
processPropertySeq(UBiDi * pBiDi,LevState * pLevState,uint8_t _prop,int32_t start,int32_t limit)1860 processPropertySeq(UBiDi *pBiDi, LevState *pLevState, uint8_t _prop,
1861 int32_t start, int32_t limit) {
1862 uint8_t cell, oldStateSeq, actionSeq;
1863 const ImpTab * pImpTab=pLevState->pImpTab;
1864 const ImpAct * pImpAct=pLevState->pImpAct;
1865 UBiDiLevel * levels=pBiDi->levels;
1866 UBiDiLevel level, addLevel;
1867 InsertPoints * pInsertPoints;
1868 int32_t start0, k;
1869
1870 start0=start; /* save original start position */
1871 oldStateSeq=(uint8_t)pLevState->state;
1872 cell=(*pImpTab)[oldStateSeq][_prop];
1873 pLevState->state=GET_STATE(cell); /* isolate the new state */
1874 actionSeq=(*pImpAct)[GET_ACTION(cell)]; /* isolate the action */
1875 addLevel=(*pImpTab)[pLevState->state][IMPTABLEVELS_RES];
1876
1877 if(actionSeq) {
1878 switch(actionSeq) {
1879 case 1: /* init ON seq */
1880 pLevState->startON=start0;
1881 break;
1882
1883 case 2: /* prepend ON seq to current seq */
1884 start=pLevState->startON;
1885 break;
1886
1887 case 3: /* EN/AN after R+ON */
1888 level=pLevState->runLevel+1;
1889 setLevelsOutsideIsolates(pBiDi, pLevState->startON, start0, level);
1890 break;
1891
1892 case 4: /* EN/AN before R for NUMBERS_SPECIAL */
1893 level=pLevState->runLevel+2;
1894 setLevelsOutsideIsolates(pBiDi, pLevState->startON, start0, level);
1895 break;
1896
1897 case 5: /* L or S after possible relevant EN/AN */
1898 /* check if we had EN after R/AL */
1899 if (pLevState->startL2EN >= 0) {
1900 addPoint(pBiDi, pLevState->startL2EN, LRM_BEFORE);
1901 }
1902 pLevState->startL2EN=-1; /* not within previous if since could also be -2 */
1903 /* check if we had any relevant EN/AN after R/AL */
1904 pInsertPoints=&(pBiDi->insertPoints);
1905 if ((pInsertPoints->capacity == 0) ||
1906 (pInsertPoints->size <= pInsertPoints->confirmed))
1907 {
1908 /* nothing, just clean up */
1909 pLevState->lastStrongRTL=-1;
1910 /* check if we have a pending conditional segment */
1911 level=(*pImpTab)[oldStateSeq][IMPTABLEVELS_RES];
1912 if ((level & 1) && (pLevState->startON > 0)) { /* after ON */
1913 start=pLevState->startON; /* reset to basic run level */
1914 }
1915 if (_prop == DirProp_S) /* add LRM before S */
1916 {
1917 addPoint(pBiDi, start0, LRM_BEFORE);
1918 pInsertPoints->confirmed=pInsertPoints->size;
1919 }
1920 break;
1921 }
1922 /* reset previous RTL cont to level for LTR text */
1923 for (k=pLevState->lastStrongRTL+1; k<start0; k++)
1924 {
1925 /* reset odd level, leave runLevel+2 as is */
1926 levels[k]=(levels[k] - 2) & ~1;
1927 }
1928 /* mark insert points as confirmed */
1929 pInsertPoints->confirmed=pInsertPoints->size;
1930 pLevState->lastStrongRTL=-1;
1931 if (_prop == DirProp_S) /* add LRM before S */
1932 {
1933 addPoint(pBiDi, start0, LRM_BEFORE);
1934 pInsertPoints->confirmed=pInsertPoints->size;
1935 }
1936 break;
1937
1938 case 6: /* R/AL after possible relevant EN/AN */
1939 /* just clean up */
1940 pInsertPoints=&(pBiDi->insertPoints);
1941 if (pInsertPoints->capacity > 0)
1942 /* remove all non confirmed insert points */
1943 pInsertPoints->size=pInsertPoints->confirmed;
1944 pLevState->startON=-1;
1945 pLevState->startL2EN=-1;
1946 pLevState->lastStrongRTL=limit - 1;
1947 break;
1948
1949 case 7: /* EN/AN after R/AL + possible cont */
1950 /* check for real AN */
1951 if ((_prop == DirProp_AN) && (pBiDi->dirProps[start0] == AN) &&
1952 (pBiDi->reorderingMode!=UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL))
1953 {
1954 /* real AN */
1955 if (pLevState->startL2EN == -1) /* if no relevant EN already found */
1956 {
1957 /* just note the righmost digit as a strong RTL */
1958 pLevState->lastStrongRTL=limit - 1;
1959 break;
1960 }
1961 if (pLevState->startL2EN >= 0) /* after EN, no AN */
1962 {
1963 addPoint(pBiDi, pLevState->startL2EN, LRM_BEFORE);
1964 pLevState->startL2EN=-2;
1965 }
1966 /* note AN */
1967 addPoint(pBiDi, start0, LRM_BEFORE);
1968 break;
1969 }
1970 /* if first EN/AN after R/AL */
1971 if (pLevState->startL2EN == -1) {
1972 pLevState->startL2EN=start0;
1973 }
1974 break;
1975
1976 case 8: /* note location of latest R/AL */
1977 pLevState->lastStrongRTL=limit - 1;
1978 pLevState->startON=-1;
1979 break;
1980
1981 case 9: /* L after R+ON/EN/AN */
1982 /* include possible adjacent number on the left */
1983 for (k=start0-1; k>=0 && !(levels[k]&1); k--);
1984 if(k>=0) {
1985 addPoint(pBiDi, k, RLM_BEFORE); /* add RLM before */
1986 pInsertPoints=&(pBiDi->insertPoints);
1987 pInsertPoints->confirmed=pInsertPoints->size; /* confirm it */
1988 }
1989 pLevState->startON=start0;
1990 break;
1991
1992 case 10: /* AN after L */
1993 /* AN numbers between L text on both sides may be trouble. */
1994 /* tentatively bracket with LRMs; will be confirmed if followed by L */
1995 addPoint(pBiDi, start0, LRM_BEFORE); /* add LRM before */
1996 addPoint(pBiDi, start0, LRM_AFTER); /* add LRM after */
1997 break;
1998
1999 case 11: /* R after L+ON/EN/AN */
2000 /* false alert, infirm LRMs around previous AN */
2001 pInsertPoints=&(pBiDi->insertPoints);
2002 pInsertPoints->size=pInsertPoints->confirmed;
2003 if (_prop == DirProp_S) /* add RLM before S */
2004 {
2005 addPoint(pBiDi, start0, RLM_BEFORE);
2006 pInsertPoints->confirmed=pInsertPoints->size;
2007 }
2008 break;
2009
2010 case 12: /* L after L+ON/AN */
2011 level=pLevState->runLevel + addLevel;
2012 for(k=pLevState->startON; k<start0; k++) {
2013 if (levels[k]<level)
2014 levels[k]=level;
2015 }
2016 pInsertPoints=&(pBiDi->insertPoints);
2017 pInsertPoints->confirmed=pInsertPoints->size; /* confirm inserts */
2018 pLevState->startON=start0;
2019 break;
2020
2021 case 13: /* L after L+ON+EN/AN/ON */
2022 level=pLevState->runLevel;
2023 for(k=start0-1; k>=pLevState->startON; k--) {
2024 if(levels[k]==level+3) {
2025 while(levels[k]==level+3) {
2026 levels[k--]-=2;
2027 }
2028 while(levels[k]==level) {
2029 k--;
2030 }
2031 }
2032 if(levels[k]==level+2) {
2033 levels[k]=level;
2034 continue;
2035 }
2036 levels[k]=level+1;
2037 }
2038 break;
2039
2040 case 14: /* R after L+ON+EN/AN/ON */
2041 level=pLevState->runLevel+1;
2042 for(k=start0-1; k>=pLevState->startON; k--) {
2043 if(levels[k]>level) {
2044 levels[k]-=2;
2045 }
2046 }
2047 break;
2048
2049 default: /* we should never get here */
2050 U_ASSERT(FALSE);
2051 break;
2052 }
2053 }
2054 if((addLevel) || (start < start0)) {
2055 level=pLevState->runLevel + addLevel;
2056 if(start>=pLevState->runStart) {
2057 for(k=start; k<limit; k++) {
2058 levels[k]=level;
2059 }
2060 } else {
2061 setLevelsOutsideIsolates(pBiDi, start, limit, level);
2062 }
2063 }
2064 }
2065
2066 /**
2067 * Returns the directionality of the last strong character at the end of the prologue, if any.
2068 * Requires prologue!=null.
2069 */
2070 static DirProp
lastL_R_AL(UBiDi * pBiDi)2071 lastL_R_AL(UBiDi *pBiDi) {
2072 const UChar *text=pBiDi->prologue;
2073 int32_t length=pBiDi->proLength;
2074 int32_t i;
2075 UChar32 uchar;
2076 DirProp dirProp;
2077 for(i=length; i>0; ) {
2078 /* i is decremented by U16_PREV */
2079 U16_PREV(text, 0, i, uchar);
2080 dirProp=(DirProp)ubidi_getCustomizedClass(pBiDi, uchar);
2081 if(dirProp==L) {
2082 return DirProp_L;
2083 }
2084 if(dirProp==R || dirProp==AL) {
2085 return DirProp_R;
2086 }
2087 if(dirProp==B) {
2088 return DirProp_ON;
2089 }
2090 }
2091 return DirProp_ON;
2092 }
2093
2094 /**
2095 * Returns the directionality of the first strong character, or digit, in the epilogue, if any.
2096 * Requires epilogue!=null.
2097 */
2098 static DirProp
firstL_R_AL_EN_AN(UBiDi * pBiDi)2099 firstL_R_AL_EN_AN(UBiDi *pBiDi) {
2100 const UChar *text=pBiDi->epilogue;
2101 int32_t length=pBiDi->epiLength;
2102 int32_t i;
2103 UChar32 uchar;
2104 DirProp dirProp;
2105 for(i=0; i<length; ) {
2106 /* i is incremented by U16_NEXT */
2107 U16_NEXT(text, i, length, uchar);
2108 dirProp=(DirProp)ubidi_getCustomizedClass(pBiDi, uchar);
2109 if(dirProp==L) {
2110 return DirProp_L;
2111 }
2112 if(dirProp==R || dirProp==AL) {
2113 return DirProp_R;
2114 }
2115 if(dirProp==EN) {
2116 return DirProp_EN;
2117 }
2118 if(dirProp==AN) {
2119 return DirProp_AN;
2120 }
2121 }
2122 return DirProp_ON;
2123 }
2124
2125 static void
resolveImplicitLevels(UBiDi * pBiDi,int32_t start,int32_t limit,DirProp sor,DirProp eor)2126 resolveImplicitLevels(UBiDi *pBiDi,
2127 int32_t start, int32_t limit,
2128 DirProp sor, DirProp eor) {
2129 const DirProp *dirProps=pBiDi->dirProps;
2130 DirProp dirProp;
2131 LevState levState;
2132 int32_t i, start1, start2;
2133 uint16_t oldStateImp, stateImp, actionImp;
2134 uint8_t gprop, resProp, cell;
2135 UBool inverseRTL;
2136 DirProp nextStrongProp=R;
2137 int32_t nextStrongPos=-1;
2138
2139 /* check for RTL inverse BiDi mode */
2140 /* FOOD FOR THOUGHT: in case of RTL inverse BiDi, it would make sense to
2141 * loop on the text characters from end to start.
2142 * This would need a different properties state table (at least different
2143 * actions) and different levels state tables (maybe very similar to the
2144 * LTR corresponding ones.
2145 */
2146 inverseRTL=(UBool)
2147 ((start<pBiDi->lastArabicPos) && (GET_PARALEVEL(pBiDi, start) & 1) &&
2148 (pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_LIKE_DIRECT ||
2149 pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL));
2150
2151 /* initialize for property and levels state tables */
2152 levState.startL2EN=-1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */
2153 levState.lastStrongRTL=-1; /* used for INVERSE_LIKE_DIRECT_WITH_MARKS */
2154 levState.runStart=start;
2155 levState.runLevel=pBiDi->levels[start];
2156 levState.pImpTab=(const ImpTab*)((pBiDi->pImpTabPair)->pImpTab)[levState.runLevel&1];
2157 levState.pImpAct=(const ImpAct*)((pBiDi->pImpTabPair)->pImpAct)[levState.runLevel&1];
2158 if(start==0 && pBiDi->proLength>0) {
2159 DirProp lastStrong=lastL_R_AL(pBiDi);
2160 if(lastStrong!=DirProp_ON) {
2161 sor=lastStrong;
2162 }
2163 }
2164 /* The isolates[] entries contain enough information to
2165 resume the bidi algorithm in the same state as it was
2166 when it was interrupted by an isolate sequence. */
2167 if(dirProps[start]==PDI && pBiDi->isolateCount >= 0) {
2168 levState.startON=pBiDi->isolates[pBiDi->isolateCount].startON;
2169 start1=pBiDi->isolates[pBiDi->isolateCount].start1;
2170 stateImp=pBiDi->isolates[pBiDi->isolateCount].stateImp;
2171 levState.state=pBiDi->isolates[pBiDi->isolateCount].state;
2172 pBiDi->isolateCount--;
2173 } else {
2174 levState.startON=-1;
2175 start1=start;
2176 if(dirProps[start]==NSM)
2177 stateImp = 1 + sor;
2178 else
2179 stateImp=0;
2180 levState.state=0;
2181 processPropertySeq(pBiDi, &levState, sor, start, start);
2182 }
2183 start2=start; /* to make Java compiler happy */
2184
2185 for(i=start; i<=limit; i++) {
2186 if(i>=limit) {
2187 int32_t k;
2188 for(k=limit-1; k>start&&(DIRPROP_FLAG(dirProps[k])&MASK_BN_EXPLICIT); k--);
2189 dirProp=dirProps[k];
2190 if(dirProp==LRI || dirProp==RLI)
2191 break; /* no forced closing for sequence ending with LRI/RLI */
2192 gprop=eor;
2193 } else {
2194 DirProp prop, prop1;
2195 prop=dirProps[i];
2196 if(prop==B) {
2197 pBiDi->isolateCount=-1; /* current isolates stack entry == none */
2198 }
2199 if(inverseRTL) {
2200 if(prop==AL) {
2201 /* AL before EN does not make it AN */
2202 prop=R;
2203 } else if(prop==EN) {
2204 if(nextStrongPos<=i) {
2205 /* look for next strong char (L/R/AL) */
2206 int32_t j;
2207 nextStrongProp=R; /* set default */
2208 nextStrongPos=limit;
2209 for(j=i+1; j<limit; j++) {
2210 prop1=dirProps[j];
2211 if(prop1==L || prop1==R || prop1==AL) {
2212 nextStrongProp=prop1;
2213 nextStrongPos=j;
2214 break;
2215 }
2216 }
2217 }
2218 if(nextStrongProp==AL) {
2219 prop=AN;
2220 }
2221 }
2222 }
2223 gprop=groupProp[prop];
2224 }
2225 oldStateImp=stateImp;
2226 cell=impTabProps[oldStateImp][gprop];
2227 stateImp=GET_STATEPROPS(cell); /* isolate the new state */
2228 actionImp=GET_ACTIONPROPS(cell); /* isolate the action */
2229 if((i==limit) && (actionImp==0)) {
2230 /* there is an unprocessed sequence if its property == eor */
2231 actionImp=1; /* process the last sequence */
2232 }
2233 if(actionImp) {
2234 resProp=impTabProps[oldStateImp][IMPTABPROPS_RES];
2235 switch(actionImp) {
2236 case 1: /* process current seq1, init new seq1 */
2237 processPropertySeq(pBiDi, &levState, resProp, start1, i);
2238 start1=i;
2239 break;
2240 case 2: /* init new seq2 */
2241 start2=i;
2242 break;
2243 case 3: /* process seq1, process seq2, init new seq1 */
2244 processPropertySeq(pBiDi, &levState, resProp, start1, start2);
2245 processPropertySeq(pBiDi, &levState, DirProp_ON, start2, i);
2246 start1=i;
2247 break;
2248 case 4: /* process seq1, set seq1=seq2, init new seq2 */
2249 processPropertySeq(pBiDi, &levState, resProp, start1, start2);
2250 start1=start2;
2251 start2=i;
2252 break;
2253 default: /* we should never get here */
2254 U_ASSERT(FALSE);
2255 break;
2256 }
2257 }
2258 }
2259
2260 /* flush possible pending sequence, e.g. ON */
2261 if(limit==pBiDi->length && pBiDi->epiLength>0) {
2262 DirProp firstStrong=firstL_R_AL_EN_AN(pBiDi);
2263 if(firstStrong!=DirProp_ON) {
2264 eor=firstStrong;
2265 }
2266 }
2267
2268 /* look for the last char not a BN or LRE/RLE/LRO/RLO/PDF */
2269 for(i=limit-1; i>start&&(DIRPROP_FLAG(dirProps[i])&MASK_BN_EXPLICIT); i--);
2270 dirProp=dirProps[i];
2271 if((dirProp==LRI || dirProp==RLI) && limit<pBiDi->length) {
2272 pBiDi->isolateCount++;
2273 pBiDi->isolates[pBiDi->isolateCount].stateImp=stateImp;
2274 pBiDi->isolates[pBiDi->isolateCount].state=levState.state;
2275 pBiDi->isolates[pBiDi->isolateCount].start1=start1;
2276 pBiDi->isolates[pBiDi->isolateCount].startON=levState.startON;
2277 }
2278 else
2279 processPropertySeq(pBiDi, &levState, eor, limit, limit);
2280 }
2281
2282 /* perform (L1) and (X9) ---------------------------------------------------- */
2283
2284 /*
2285 * Reset the embedding levels for some non-graphic characters (L1).
2286 * This function also sets appropriate levels for BN, and
2287 * explicit embedding types that are supposed to have been removed
2288 * from the paragraph in (X9).
2289 */
2290 static void
adjustWSLevels(UBiDi * pBiDi)2291 adjustWSLevels(UBiDi *pBiDi) {
2292 const DirProp *dirProps=pBiDi->dirProps;
2293 UBiDiLevel *levels=pBiDi->levels;
2294 int32_t i;
2295
2296 if(pBiDi->flags&MASK_WS) {
2297 UBool orderParagraphsLTR=pBiDi->orderParagraphsLTR;
2298 Flags flag;
2299
2300 i=pBiDi->trailingWSStart;
2301 while(i>0) {
2302 /* reset a sequence of WS/BN before eop and B/S to the paragraph paraLevel */
2303 while(i>0 && (flag=DIRPROP_FLAG(dirProps[--i]))&MASK_WS) {
2304 if(orderParagraphsLTR&&(flag&DIRPROP_FLAG(B))) {
2305 levels[i]=0;
2306 } else {
2307 levels[i]=GET_PARALEVEL(pBiDi, i);
2308 }
2309 }
2310
2311 /* reset BN to the next character's paraLevel until B/S, which restarts above loop */
2312 /* here, i+1 is guaranteed to be <length */
2313 while(i>0) {
2314 flag=DIRPROP_FLAG(dirProps[--i]);
2315 if(flag&MASK_BN_EXPLICIT) {
2316 levels[i]=levels[i+1];
2317 } else if(orderParagraphsLTR&&(flag&DIRPROP_FLAG(B))) {
2318 levels[i]=0;
2319 break;
2320 } else if(flag&MASK_B_S) {
2321 levels[i]=GET_PARALEVEL(pBiDi, i);
2322 break;
2323 }
2324 }
2325 }
2326 }
2327 }
2328
2329 U_CAPI void U_EXPORT2
ubidi_setContext(UBiDi * pBiDi,const UChar * prologue,int32_t proLength,const UChar * epilogue,int32_t epiLength,UErrorCode * pErrorCode)2330 ubidi_setContext(UBiDi *pBiDi,
2331 const UChar *prologue, int32_t proLength,
2332 const UChar *epilogue, int32_t epiLength,
2333 UErrorCode *pErrorCode) {
2334 /* check the argument values */
2335 RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrorCode);
2336 if(pBiDi==NULL || proLength<-1 || epiLength<-1 ||
2337 (prologue==NULL && proLength!=0) || (epilogue==NULL && epiLength!=0)) {
2338 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2339 return;
2340 }
2341
2342 if(proLength==-1) {
2343 pBiDi->proLength=u_strlen(prologue);
2344 } else {
2345 pBiDi->proLength=proLength;
2346 }
2347 if(epiLength==-1) {
2348 pBiDi->epiLength=u_strlen(epilogue);
2349 } else {
2350 pBiDi->epiLength=epiLength;
2351 }
2352 pBiDi->prologue=prologue;
2353 pBiDi->epilogue=epilogue;
2354 }
2355
2356 static void
setParaSuccess(UBiDi * pBiDi)2357 setParaSuccess(UBiDi *pBiDi) {
2358 pBiDi->proLength=0; /* forget the last context */
2359 pBiDi->epiLength=0;
2360 pBiDi->pParaBiDi=pBiDi; /* mark successful setPara */
2361 }
2362
2363 #define BIDI_MIN(x, y) ((x)<(y) ? (x) : (y))
2364 #define BIDI_ABS(x) ((x)>=0 ? (x) : (-(x)))
2365
2366 static void
setParaRunsOnly(UBiDi * pBiDi,const UChar * text,int32_t length,UBiDiLevel paraLevel,UErrorCode * pErrorCode)2367 setParaRunsOnly(UBiDi *pBiDi, const UChar *text, int32_t length,
2368 UBiDiLevel paraLevel, UErrorCode *pErrorCode) {
2369 int32_t *runsOnlyMemory = NULL;
2370 int32_t *visualMap;
2371 UChar *visualText;
2372 int32_t saveLength, saveTrailingWSStart;
2373 const UBiDiLevel *levels;
2374 UBiDiLevel *saveLevels;
2375 UBiDiDirection saveDirection;
2376 UBool saveMayAllocateText;
2377 Run *runs;
2378 int32_t visualLength, i, j, visualStart, logicalStart,
2379 runCount, runLength, addedRuns, insertRemove,
2380 start, limit, step, indexOddBit, logicalPos,
2381 index0, index1;
2382 uint32_t saveOptions;
2383
2384 pBiDi->reorderingMode=UBIDI_REORDER_DEFAULT;
2385 if(length==0) {
2386 ubidi_setPara(pBiDi, text, length, paraLevel, NULL, pErrorCode);
2387 goto cleanup3;
2388 }
2389 /* obtain memory for mapping table and visual text */
2390 runsOnlyMemory=static_cast<int32_t *>(uprv_malloc(length*(sizeof(int32_t)+sizeof(UChar)+sizeof(UBiDiLevel))));
2391 if(runsOnlyMemory==NULL) {
2392 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2393 goto cleanup3;
2394 }
2395 visualMap=runsOnlyMemory;
2396 visualText=(UChar *)&visualMap[length];
2397 saveLevels=(UBiDiLevel *)&visualText[length];
2398 saveOptions=pBiDi->reorderingOptions;
2399 if(saveOptions & UBIDI_OPTION_INSERT_MARKS) {
2400 pBiDi->reorderingOptions&=~UBIDI_OPTION_INSERT_MARKS;
2401 pBiDi->reorderingOptions|=UBIDI_OPTION_REMOVE_CONTROLS;
2402 }
2403 paraLevel&=1; /* accept only 0 or 1 */
2404 ubidi_setPara(pBiDi, text, length, paraLevel, NULL, pErrorCode);
2405 if(U_FAILURE(*pErrorCode)) {
2406 goto cleanup3;
2407 }
2408 /* we cannot access directly pBiDi->levels since it is not yet set if
2409 * direction is not MIXED
2410 */
2411 levels=ubidi_getLevels(pBiDi, pErrorCode);
2412 uprv_memcpy(saveLevels, levels, (size_t)pBiDi->length*sizeof(UBiDiLevel));
2413 saveTrailingWSStart=pBiDi->trailingWSStart;
2414 saveLength=pBiDi->length;
2415 saveDirection=pBiDi->direction;
2416
2417 /* FOOD FOR THOUGHT: instead of writing the visual text, we could use
2418 * the visual map and the dirProps array to drive the second call
2419 * to ubidi_setPara (but must make provision for possible removal of
2420 * BiDi controls. Alternatively, only use the dirProps array via
2421 * customized classifier callback.
2422 */
2423 visualLength=ubidi_writeReordered(pBiDi, visualText, length,
2424 UBIDI_DO_MIRRORING, pErrorCode);
2425 ubidi_getVisualMap(pBiDi, visualMap, pErrorCode);
2426 if(U_FAILURE(*pErrorCode)) {
2427 goto cleanup2;
2428 }
2429 pBiDi->reorderingOptions=saveOptions;
2430
2431 pBiDi->reorderingMode=UBIDI_REORDER_INVERSE_LIKE_DIRECT;
2432 paraLevel^=1;
2433 /* Because what we did with reorderingOptions, visualText may be shorter
2434 * than the original text. But we don't want the levels memory to be
2435 * reallocated shorter than the original length, since we need to restore
2436 * the levels as after the first call to ubidi_setpara() before returning.
2437 * We will force mayAllocateText to FALSE before the second call to
2438 * ubidi_setpara(), and will restore it afterwards.
2439 */
2440 saveMayAllocateText=pBiDi->mayAllocateText;
2441 pBiDi->mayAllocateText=FALSE;
2442 ubidi_setPara(pBiDi, visualText, visualLength, paraLevel, NULL, pErrorCode);
2443 pBiDi->mayAllocateText=saveMayAllocateText;
2444 ubidi_getRuns(pBiDi, pErrorCode);
2445 if(U_FAILURE(*pErrorCode)) {
2446 goto cleanup1;
2447 }
2448 /* check if some runs must be split, count how many splits */
2449 addedRuns=0;
2450 runCount=pBiDi->runCount;
2451 runs=pBiDi->runs;
2452 visualStart=0;
2453 for(i=0; i<runCount; i++, visualStart+=runLength) {
2454 runLength=runs[i].visualLimit-visualStart;
2455 if(runLength<2) {
2456 continue;
2457 }
2458 logicalStart=GET_INDEX(runs[i].logicalStart);
2459 for(j=logicalStart+1; j<logicalStart+runLength; j++) {
2460 index0=visualMap[j];
2461 index1=visualMap[j-1];
2462 if((BIDI_ABS(index0-index1)!=1) || (saveLevels[index0]!=saveLevels[index1])) {
2463 addedRuns++;
2464 }
2465 }
2466 }
2467 if(addedRuns) {
2468 if(getRunsMemory(pBiDi, runCount+addedRuns)) {
2469 if(runCount==1) {
2470 /* because we switch from UBiDi.simpleRuns to UBiDi.runs */
2471 pBiDi->runsMemory[0]=runs[0];
2472 }
2473 runs=pBiDi->runs=pBiDi->runsMemory;
2474 pBiDi->runCount+=addedRuns;
2475 } else {
2476 goto cleanup1;
2477 }
2478 }
2479 /* split runs which are not consecutive in source text */
2480 for(i=runCount-1; i>=0; i--) {
2481 runLength= i==0 ? runs[0].visualLimit :
2482 runs[i].visualLimit-runs[i-1].visualLimit;
2483 logicalStart=runs[i].logicalStart;
2484 indexOddBit=GET_ODD_BIT(logicalStart);
2485 logicalStart=GET_INDEX(logicalStart);
2486 if(runLength<2) {
2487 if(addedRuns) {
2488 runs[i+addedRuns]=runs[i];
2489 }
2490 logicalPos=visualMap[logicalStart];
2491 runs[i+addedRuns].logicalStart=MAKE_INDEX_ODD_PAIR(logicalPos,
2492 saveLevels[logicalPos]^indexOddBit);
2493 continue;
2494 }
2495 if(indexOddBit) {
2496 start=logicalStart;
2497 limit=logicalStart+runLength-1;
2498 step=1;
2499 } else {
2500 start=logicalStart+runLength-1;
2501 limit=logicalStart;
2502 step=-1;
2503 }
2504 for(j=start; j!=limit; j+=step) {
2505 index0=visualMap[j];
2506 index1=visualMap[j+step];
2507 if((BIDI_ABS(index0-index1)!=1) || (saveLevels[index0]!=saveLevels[index1])) {
2508 logicalPos=BIDI_MIN(visualMap[start], index0);
2509 runs[i+addedRuns].logicalStart=MAKE_INDEX_ODD_PAIR(logicalPos,
2510 saveLevels[logicalPos]^indexOddBit);
2511 runs[i+addedRuns].visualLimit=runs[i].visualLimit;
2512 runs[i].visualLimit-=BIDI_ABS(j-start)+1;
2513 insertRemove=runs[i].insertRemove&(LRM_AFTER|RLM_AFTER);
2514 runs[i+addedRuns].insertRemove=insertRemove;
2515 runs[i].insertRemove&=~insertRemove;
2516 start=j+step;
2517 addedRuns--;
2518 }
2519 }
2520 if(addedRuns) {
2521 runs[i+addedRuns]=runs[i];
2522 }
2523 logicalPos=BIDI_MIN(visualMap[start], visualMap[limit]);
2524 runs[i+addedRuns].logicalStart=MAKE_INDEX_ODD_PAIR(logicalPos,
2525 saveLevels[logicalPos]^indexOddBit);
2526 }
2527
2528 cleanup1:
2529 /* restore initial paraLevel */
2530 pBiDi->paraLevel^=1;
2531 cleanup2:
2532 /* restore real text */
2533 pBiDi->text=text;
2534 pBiDi->length=saveLength;
2535 pBiDi->originalLength=length;
2536 pBiDi->direction=saveDirection;
2537 /* the saved levels should never excess levelsSize, but we check anyway */
2538 if(saveLength>pBiDi->levelsSize) {
2539 saveLength=pBiDi->levelsSize;
2540 }
2541 uprv_memcpy(pBiDi->levels, saveLevels, (size_t)saveLength*sizeof(UBiDiLevel));
2542 pBiDi->trailingWSStart=saveTrailingWSStart;
2543 if(pBiDi->runCount>1) {
2544 pBiDi->direction=UBIDI_MIXED;
2545 }
2546 cleanup3:
2547 /* free memory for mapping table and visual text */
2548 uprv_free(runsOnlyMemory);
2549
2550 pBiDi->reorderingMode=UBIDI_REORDER_RUNS_ONLY;
2551 }
2552
2553 /* ubidi_setPara ------------------------------------------------------------ */
2554
2555 U_CAPI void U_EXPORT2
ubidi_setPara(UBiDi * pBiDi,const UChar * text,int32_t length,UBiDiLevel paraLevel,UBiDiLevel * embeddingLevels,UErrorCode * pErrorCode)2556 ubidi_setPara(UBiDi *pBiDi, const UChar *text, int32_t length,
2557 UBiDiLevel paraLevel, UBiDiLevel *embeddingLevels,
2558 UErrorCode *pErrorCode) {
2559 UBiDiDirection direction;
2560 DirProp *dirProps;
2561
2562 /* check the argument values */
2563 RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrorCode);
2564 if(pBiDi==NULL || text==NULL || length<-1 ||
2565 (paraLevel>UBIDI_MAX_EXPLICIT_LEVEL && paraLevel<UBIDI_DEFAULT_LTR)) {
2566 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2567 return;
2568 }
2569
2570 if(length==-1) {
2571 length=u_strlen(text);
2572 }
2573
2574 /* special treatment for RUNS_ONLY mode */
2575 if(pBiDi->reorderingMode==UBIDI_REORDER_RUNS_ONLY) {
2576 setParaRunsOnly(pBiDi, text, length, paraLevel, pErrorCode);
2577 return;
2578 }
2579
2580 /* initialize the UBiDi structure */
2581 pBiDi->pParaBiDi=NULL; /* mark unfinished setPara */
2582 pBiDi->text=text;
2583 pBiDi->length=pBiDi->originalLength=pBiDi->resultLength=length;
2584 pBiDi->paraLevel=paraLevel;
2585 pBiDi->direction=(UBiDiDirection)(paraLevel&1);
2586 pBiDi->paraCount=1;
2587
2588 pBiDi->dirProps=NULL;
2589 pBiDi->levels=NULL;
2590 pBiDi->runs=NULL;
2591 pBiDi->insertPoints.size=0; /* clean up from last call */
2592 pBiDi->insertPoints.confirmed=0; /* clean up from last call */
2593
2594 /*
2595 * Save the original paraLevel if contextual; otherwise, set to 0.
2596 */
2597 pBiDi->defaultParaLevel=IS_DEFAULT_LEVEL(paraLevel);
2598
2599 if(length==0) {
2600 /*
2601 * For an empty paragraph, create a UBiDi object with the paraLevel and
2602 * the flags and the direction set but without allocating zero-length arrays.
2603 * There is nothing more to do.
2604 */
2605 if(IS_DEFAULT_LEVEL(paraLevel)) {
2606 pBiDi->paraLevel&=1;
2607 pBiDi->defaultParaLevel=0;
2608 }
2609 pBiDi->flags=DIRPROP_FLAG_LR(paraLevel);
2610 pBiDi->runCount=0;
2611 pBiDi->paraCount=0;
2612 setParaSuccess(pBiDi); /* mark successful setPara */
2613 return;
2614 }
2615
2616 pBiDi->runCount=-1;
2617
2618 /* allocate paras memory */
2619 if(pBiDi->parasMemory)
2620 pBiDi->paras=pBiDi->parasMemory;
2621 else
2622 pBiDi->paras=pBiDi->simpleParas;
2623
2624 /*
2625 * Get the directional properties,
2626 * the flags bit-set, and
2627 * determine the paragraph level if necessary.
2628 */
2629 if(getDirPropsMemory(pBiDi, length)) {
2630 pBiDi->dirProps=pBiDi->dirPropsMemory;
2631 if(!getDirProps(pBiDi)) {
2632 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2633 return;
2634 }
2635 } else {
2636 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2637 return;
2638 }
2639 dirProps=pBiDi->dirProps;
2640 /* the processed length may have changed if UBIDI_OPTION_STREAMING */
2641 length= pBiDi->length;
2642 pBiDi->trailingWSStart=length; /* the levels[] will reflect the WS run */
2643
2644 /* are explicit levels specified? */
2645 if(embeddingLevels==NULL) {
2646 /* no: determine explicit levels according to the (Xn) rules */\
2647 if(getLevelsMemory(pBiDi, length)) {
2648 pBiDi->levels=pBiDi->levelsMemory;
2649 direction=resolveExplicitLevels(pBiDi, pErrorCode);
2650 if(U_FAILURE(*pErrorCode)) {
2651 return;
2652 }
2653 } else {
2654 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2655 return;
2656 }
2657 } else {
2658 /* set BN for all explicit codes, check that all levels are 0 or paraLevel..UBIDI_MAX_EXPLICIT_LEVEL */
2659 pBiDi->levels=embeddingLevels;
2660 direction=checkExplicitLevels(pBiDi, pErrorCode);
2661 if(U_FAILURE(*pErrorCode)) {
2662 return;
2663 }
2664 }
2665
2666 /* allocate isolate memory */
2667 if(pBiDi->isolateCount<=SIMPLE_ISOLATES_COUNT)
2668 pBiDi->isolates=pBiDi->simpleIsolates;
2669 else
2670 if((int32_t)(pBiDi->isolateCount*sizeof(Isolate))<=pBiDi->isolatesSize)
2671 pBiDi->isolates=pBiDi->isolatesMemory;
2672 else {
2673 if(getInitialIsolatesMemory(pBiDi, pBiDi->isolateCount)) {
2674 pBiDi->isolates=pBiDi->isolatesMemory;
2675 } else {
2676 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2677 return;
2678 }
2679 }
2680 pBiDi->isolateCount=-1; /* current isolates stack entry == none */
2681
2682 /*
2683 * The steps after (X9) in the UBiDi algorithm are performed only if
2684 * the paragraph text has mixed directionality!
2685 */
2686 pBiDi->direction=direction;
2687 switch(direction) {
2688 case UBIDI_LTR:
2689 /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
2690 pBiDi->trailingWSStart=0;
2691 break;
2692 case UBIDI_RTL:
2693 /* all levels are implicitly at paraLevel (important for ubidi_getLevels()) */
2694 pBiDi->trailingWSStart=0;
2695 break;
2696 default:
2697 /*
2698 * Choose the right implicit state table
2699 */
2700 switch(pBiDi->reorderingMode) {
2701 case UBIDI_REORDER_DEFAULT:
2702 pBiDi->pImpTabPair=&impTab_DEFAULT;
2703 break;
2704 case UBIDI_REORDER_NUMBERS_SPECIAL:
2705 pBiDi->pImpTabPair=&impTab_NUMBERS_SPECIAL;
2706 break;
2707 case UBIDI_REORDER_GROUP_NUMBERS_WITH_R:
2708 pBiDi->pImpTabPair=&impTab_GROUP_NUMBERS_WITH_R;
2709 break;
2710 case UBIDI_REORDER_INVERSE_NUMBERS_AS_L:
2711 pBiDi->pImpTabPair=&impTab_INVERSE_NUMBERS_AS_L;
2712 break;
2713 case UBIDI_REORDER_INVERSE_LIKE_DIRECT:
2714 if (pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
2715 pBiDi->pImpTabPair=&impTab_INVERSE_LIKE_DIRECT_WITH_MARKS;
2716 } else {
2717 pBiDi->pImpTabPair=&impTab_INVERSE_LIKE_DIRECT;
2718 }
2719 break;
2720 case UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL:
2721 if (pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
2722 pBiDi->pImpTabPair=&impTab_INVERSE_FOR_NUMBERS_SPECIAL_WITH_MARKS;
2723 } else {
2724 pBiDi->pImpTabPair=&impTab_INVERSE_FOR_NUMBERS_SPECIAL;
2725 }
2726 break;
2727 default:
2728 /* we should never get here */
2729 U_ASSERT(FALSE);
2730 break;
2731 }
2732 /*
2733 * If there are no external levels specified and there
2734 * are no significant explicit level codes in the text,
2735 * then we can treat the entire paragraph as one run.
2736 * Otherwise, we need to perform the following rules on runs of
2737 * the text with the same embedding levels. (X10)
2738 * "Significant" explicit level codes are ones that actually
2739 * affect non-BN characters.
2740 * Examples for "insignificant" ones are empty embeddings
2741 * LRE-PDF, LRE-RLE-PDF-PDF, etc.
2742 */
2743 if(embeddingLevels==NULL && pBiDi->paraCount<=1 &&
2744 !(pBiDi->flags&DIRPROP_FLAG_MULTI_RUNS)) {
2745 resolveImplicitLevels(pBiDi, 0, length,
2746 GET_LR_FROM_LEVEL(GET_PARALEVEL(pBiDi, 0)),
2747 GET_LR_FROM_LEVEL(GET_PARALEVEL(pBiDi, length-1)));
2748 } else {
2749 /* sor, eor: start and end types of same-level-run */
2750 UBiDiLevel *levels=pBiDi->levels;
2751 int32_t start, limit=0;
2752 UBiDiLevel level, nextLevel;
2753 DirProp sor, eor;
2754
2755 /* determine the first sor and set eor to it because of the loop body (sor=eor there) */
2756 level=GET_PARALEVEL(pBiDi, 0);
2757 nextLevel=levels[0];
2758 if(level<nextLevel) {
2759 eor=GET_LR_FROM_LEVEL(nextLevel);
2760 } else {
2761 eor=GET_LR_FROM_LEVEL(level);
2762 }
2763
2764 do {
2765 /* determine start and limit of the run (end points just behind the run) */
2766
2767 /* the values for this run's start are the same as for the previous run's end */
2768 start=limit;
2769 level=nextLevel;
2770 if((start>0) && (dirProps[start-1]==B)) {
2771 /* except if this is a new paragraph, then set sor = para level */
2772 sor=GET_LR_FROM_LEVEL(GET_PARALEVEL(pBiDi, start));
2773 } else {
2774 sor=eor;
2775 }
2776
2777 /* search for the limit of this run */
2778 while((++limit<length) &&
2779 ((levels[limit]==level) ||
2780 (DIRPROP_FLAG(dirProps[limit])&MASK_BN_EXPLICIT))) {}
2781
2782 /* get the correct level of the next run */
2783 if(limit<length) {
2784 nextLevel=levels[limit];
2785 } else {
2786 nextLevel=GET_PARALEVEL(pBiDi, length-1);
2787 }
2788
2789 /* determine eor from max(level, nextLevel); sor is last run's eor */
2790 if(NO_OVERRIDE(level)<NO_OVERRIDE(nextLevel)) {
2791 eor=GET_LR_FROM_LEVEL(nextLevel);
2792 } else {
2793 eor=GET_LR_FROM_LEVEL(level);
2794 }
2795
2796 /* if the run consists of overridden directional types, then there
2797 are no implicit types to be resolved */
2798 if(!(level&UBIDI_LEVEL_OVERRIDE)) {
2799 resolveImplicitLevels(pBiDi, start, limit, sor, eor);
2800 } else {
2801 /* remove the UBIDI_LEVEL_OVERRIDE flags */
2802 do {
2803 levels[start++]&=~UBIDI_LEVEL_OVERRIDE;
2804 } while(start<limit);
2805 }
2806 } while(limit<length);
2807 }
2808 /* check if we got any memory shortage while adding insert points */
2809 if (U_FAILURE(pBiDi->insertPoints.errorCode))
2810 {
2811 *pErrorCode=pBiDi->insertPoints.errorCode;
2812 return;
2813 }
2814 /* reset the embedding levels for some non-graphic characters (L1), (X9) */
2815 adjustWSLevels(pBiDi);
2816 break;
2817 }
2818 /* add RLM for inverse Bidi with contextual orientation resolving
2819 * to RTL which would not round-trip otherwise
2820 */
2821 if((pBiDi->defaultParaLevel>0) &&
2822 (pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) &&
2823 ((pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_LIKE_DIRECT) ||
2824 (pBiDi->reorderingMode==UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL))) {
2825 int32_t i, j, start, last;
2826 UBiDiLevel level;
2827 DirProp dirProp;
2828 for(i=0; i<pBiDi->paraCount; i++) {
2829 last=(pBiDi->paras[i].limit)-1;
2830 level= static_cast<UBiDiLevel>(pBiDi->paras[i].level);
2831 if(level==0)
2832 continue; /* LTR paragraph */
2833 start= i==0 ? 0 : pBiDi->paras[i-1].limit;
2834 for(j=last; j>=start; j--) {
2835 dirProp=dirProps[j];
2836 if(dirProp==L) {
2837 if(j<last) {
2838 while(dirProps[last]==B) {
2839 last--;
2840 }
2841 }
2842 addPoint(pBiDi, last, RLM_BEFORE);
2843 break;
2844 }
2845 if(DIRPROP_FLAG(dirProp) & MASK_R_AL) {
2846 break;
2847 }
2848 }
2849 }
2850 }
2851
2852 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
2853 pBiDi->resultLength -= pBiDi->controlCount;
2854 } else {
2855 pBiDi->resultLength += pBiDi->insertPoints.size;
2856 }
2857 setParaSuccess(pBiDi); /* mark successful setPara */
2858 }
2859
2860 U_CAPI void U_EXPORT2
ubidi_orderParagraphsLTR(UBiDi * pBiDi,UBool orderParagraphsLTR)2861 ubidi_orderParagraphsLTR(UBiDi *pBiDi, UBool orderParagraphsLTR) {
2862 if(pBiDi!=NULL) {
2863 pBiDi->orderParagraphsLTR=orderParagraphsLTR;
2864 }
2865 }
2866
2867 U_CAPI UBool U_EXPORT2
ubidi_isOrderParagraphsLTR(UBiDi * pBiDi)2868 ubidi_isOrderParagraphsLTR(UBiDi *pBiDi) {
2869 if(pBiDi!=NULL) {
2870 return pBiDi->orderParagraphsLTR;
2871 } else {
2872 return FALSE;
2873 }
2874 }
2875
2876 U_CAPI UBiDiDirection U_EXPORT2
ubidi_getDirection(const UBiDi * pBiDi)2877 ubidi_getDirection(const UBiDi *pBiDi) {
2878 if(IS_VALID_PARA_OR_LINE(pBiDi)) {
2879 return pBiDi->direction;
2880 } else {
2881 return UBIDI_LTR;
2882 }
2883 }
2884
2885 U_CAPI const UChar * U_EXPORT2
ubidi_getText(const UBiDi * pBiDi)2886 ubidi_getText(const UBiDi *pBiDi) {
2887 if(IS_VALID_PARA_OR_LINE(pBiDi)) {
2888 return pBiDi->text;
2889 } else {
2890 return NULL;
2891 }
2892 }
2893
2894 U_CAPI int32_t U_EXPORT2
ubidi_getLength(const UBiDi * pBiDi)2895 ubidi_getLength(const UBiDi *pBiDi) {
2896 if(IS_VALID_PARA_OR_LINE(pBiDi)) {
2897 return pBiDi->originalLength;
2898 } else {
2899 return 0;
2900 }
2901 }
2902
2903 U_CAPI int32_t U_EXPORT2
ubidi_getProcessedLength(const UBiDi * pBiDi)2904 ubidi_getProcessedLength(const UBiDi *pBiDi) {
2905 if(IS_VALID_PARA_OR_LINE(pBiDi)) {
2906 return pBiDi->length;
2907 } else {
2908 return 0;
2909 }
2910 }
2911
2912 U_CAPI int32_t U_EXPORT2
ubidi_getResultLength(const UBiDi * pBiDi)2913 ubidi_getResultLength(const UBiDi *pBiDi) {
2914 if(IS_VALID_PARA_OR_LINE(pBiDi)) {
2915 return pBiDi->resultLength;
2916 } else {
2917 return 0;
2918 }
2919 }
2920
2921 /* paragraphs API functions ------------------------------------------------- */
2922
2923 U_CAPI UBiDiLevel U_EXPORT2
ubidi_getParaLevel(const UBiDi * pBiDi)2924 ubidi_getParaLevel(const UBiDi *pBiDi) {
2925 if(IS_VALID_PARA_OR_LINE(pBiDi)) {
2926 return pBiDi->paraLevel;
2927 } else {
2928 return 0;
2929 }
2930 }
2931
2932 U_CAPI int32_t U_EXPORT2
ubidi_countParagraphs(UBiDi * pBiDi)2933 ubidi_countParagraphs(UBiDi *pBiDi) {
2934 if(!IS_VALID_PARA_OR_LINE(pBiDi)) {
2935 return 0;
2936 } else {
2937 return pBiDi->paraCount;
2938 }
2939 }
2940
2941 U_CAPI void U_EXPORT2
ubidi_getParagraphByIndex(const UBiDi * pBiDi,int32_t paraIndex,int32_t * pParaStart,int32_t * pParaLimit,UBiDiLevel * pParaLevel,UErrorCode * pErrorCode)2942 ubidi_getParagraphByIndex(const UBiDi *pBiDi, int32_t paraIndex,
2943 int32_t *pParaStart, int32_t *pParaLimit,
2944 UBiDiLevel *pParaLevel, UErrorCode *pErrorCode) {
2945 int32_t paraStart;
2946
2947 /* check the argument values */
2948 RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrorCode);
2949 RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(pBiDi, *pErrorCode);
2950 RETURN_VOID_IF_BAD_RANGE(paraIndex, 0, pBiDi->paraCount, *pErrorCode);
2951
2952 pBiDi=pBiDi->pParaBiDi; /* get Para object if Line object */
2953 if(paraIndex) {
2954 paraStart=pBiDi->paras[paraIndex-1].limit;
2955 } else {
2956 paraStart=0;
2957 }
2958 if(pParaStart!=NULL) {
2959 *pParaStart=paraStart;
2960 }
2961 if(pParaLimit!=NULL) {
2962 *pParaLimit=pBiDi->paras[paraIndex].limit;
2963 }
2964 if(pParaLevel!=NULL) {
2965 *pParaLevel=GET_PARALEVEL(pBiDi, paraStart);
2966 }
2967 }
2968
2969 U_CAPI int32_t U_EXPORT2
ubidi_getParagraph(const UBiDi * pBiDi,int32_t charIndex,int32_t * pParaStart,int32_t * pParaLimit,UBiDiLevel * pParaLevel,UErrorCode * pErrorCode)2970 ubidi_getParagraph(const UBiDi *pBiDi, int32_t charIndex,
2971 int32_t *pParaStart, int32_t *pParaLimit,
2972 UBiDiLevel *pParaLevel, UErrorCode *pErrorCode) {
2973 int32_t paraIndex;
2974
2975 /* check the argument values */
2976 /* pErrorCode will be checked by the call to ubidi_getParagraphByIndex */
2977 RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrorCode, -1);
2978 RETURN_IF_NOT_VALID_PARA_OR_LINE(pBiDi, *pErrorCode, -1);
2979 pBiDi=pBiDi->pParaBiDi; /* get Para object if Line object */
2980 RETURN_IF_BAD_RANGE(charIndex, 0, pBiDi->length, *pErrorCode, -1);
2981
2982 for(paraIndex=0; charIndex>=pBiDi->paras[paraIndex].limit; paraIndex++);
2983 ubidi_getParagraphByIndex(pBiDi, paraIndex, pParaStart, pParaLimit, pParaLevel, pErrorCode);
2984 return paraIndex;
2985 }
2986
2987 U_CAPI void U_EXPORT2
ubidi_setClassCallback(UBiDi * pBiDi,UBiDiClassCallback * newFn,const void * newContext,UBiDiClassCallback ** oldFn,const void ** oldContext,UErrorCode * pErrorCode)2988 ubidi_setClassCallback(UBiDi *pBiDi, UBiDiClassCallback *newFn,
2989 const void *newContext, UBiDiClassCallback **oldFn,
2990 const void **oldContext, UErrorCode *pErrorCode)
2991 {
2992 RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrorCode);
2993 if(pBiDi==NULL) {
2994 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2995 return;
2996 }
2997 if( oldFn )
2998 {
2999 *oldFn = pBiDi->fnClassCallback;
3000 }
3001 if( oldContext )
3002 {
3003 *oldContext = pBiDi->coClassCallback;
3004 }
3005 pBiDi->fnClassCallback = newFn;
3006 pBiDi->coClassCallback = newContext;
3007 }
3008
3009 U_CAPI void U_EXPORT2
ubidi_getClassCallback(UBiDi * pBiDi,UBiDiClassCallback ** fn,const void ** context)3010 ubidi_getClassCallback(UBiDi *pBiDi, UBiDiClassCallback **fn, const void **context)
3011 {
3012 if(pBiDi==NULL) {
3013 return;
3014 }
3015 if( fn )
3016 {
3017 *fn = pBiDi->fnClassCallback;
3018 }
3019 if( context )
3020 {
3021 *context = pBiDi->coClassCallback;
3022 }
3023 }
3024
3025 U_CAPI UCharDirection U_EXPORT2
ubidi_getCustomizedClass(UBiDi * pBiDi,UChar32 c)3026 ubidi_getCustomizedClass(UBiDi *pBiDi, UChar32 c)
3027 {
3028 UCharDirection dir;
3029
3030 if( pBiDi->fnClassCallback == NULL ||
3031 (dir = (*pBiDi->fnClassCallback)(pBiDi->coClassCallback, c)) == U_BIDI_CLASS_DEFAULT )
3032 {
3033 dir = ubidi_getClass(c);
3034 }
3035 if(dir >= U_CHAR_DIRECTION_COUNT) {
3036 dir = (UCharDirection)ON;
3037 }
3038 return dir;
3039 }
3040