1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ubidiwrt.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999aug06
14 * created by: Markus W. Scherer, updated by Matitiahu Allouche
15 *
16 * This file contains implementations for BiDi functions that use
17 * the core algorithm and core API to write reordered text.
18 */
19
20 /* set import/export definitions */
21 #ifndef U_COMMON_IMPLEMENTATION
22 # define U_COMMON_IMPLEMENTATION
23 #endif
24
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ubidi.h"
29 #include "cmemory.h"
30 #include "ustr_imp.h"
31 #include "ubidiimp.h"
32
33 /*
34 * The function implementations in this file are designed
35 * for UTF-16 and UTF-32, not for UTF-8.
36 *
37 * Assumptions that are not true for UTF-8:
38 * - Any code point always needs the same number of code units
39 * ("minimum-length-problem" of UTF-8)
40 * - The BiDi control characters need only one code unit each
41 *
42 * Further assumptions for all UTFs:
43 * - u_charMirror(c) needs the same number of code units as c
44 */
45 #if UTF_SIZE==8
46 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
47 #endif
48
/*
 * Nonzero if the general category value `type` (as returned by u_charType())
 * is one of the three mark categories: non-spacing, combining spacing,
 * or enclosing mark.
 */
#define IS_COMBINING(type) \
    ((1UL<<(type))& \
        ((1UL<<U_NON_SPACING_MARK)| \
         (1UL<<U_COMBINING_SPACING_MARK)| \
         (1UL<<U_ENCLOSING_MARK)))
50
51 /*
52 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
53 * semantically write RTL runs in reverse and later reverse them again.
54 * Instead, we actually write them in forward order to begin with.
55 * However, if the RTL run was to be mirrored, we need to mirror here now
56 * since the implicit second reversal must not do it.
57 * It looks strange to do mirroring in LTR output, but it is only because
58 * we are writing RTL output in reverse.
59 */
60 static int32_t
doWriteForward(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)61 doWriteForward(const UChar *src, int32_t srcLength,
62 UChar *dest, int32_t destSize,
63 uint16_t options,
64 UErrorCode *pErrorCode) {
65 /* optimize for several combinations of options */
66 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
67 case 0: {
68 /* simply copy the LTR run to the destination */
69 int32_t length=srcLength;
70 if(destSize<length) {
71 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
72 return srcLength;
73 }
74 do {
75 *dest++=*src++;
76 } while(--length>0);
77 return srcLength;
78 }
79 case UBIDI_DO_MIRRORING: {
80 /* do mirroring */
81 int32_t i=0, j=0;
82 UChar32 c;
83
84 if(destSize<srcLength) {
85 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86 return srcLength;
87 }
88 do {
89 UTF_NEXT_CHAR(src, i, srcLength, c);
90 c=u_charMirror(c);
91 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
92 } while(i<srcLength);
93 return srcLength;
94 }
95 case UBIDI_REMOVE_BIDI_CONTROLS: {
96 /* copy the LTR run and remove any BiDi control characters */
97 int32_t remaining=destSize;
98 UChar c;
99 do {
100 c=*src++;
101 if(!IS_BIDI_CONTROL_CHAR(c)) {
102 if(--remaining<0) {
103 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
104
105 /* preflight the length */
106 while(--srcLength>0) {
107 c=*src++;
108 if(!IS_BIDI_CONTROL_CHAR(c)) {
109 --remaining;
110 }
111 }
112 return destSize-remaining;
113 }
114 *dest++=c;
115 }
116 } while(--srcLength>0);
117 return destSize-remaining;
118 }
119 default: {
120 /* remove BiDi control characters and do mirroring */
121 int32_t remaining=destSize;
122 int32_t i, j=0;
123 UChar32 c;
124 do {
125 i=0;
126 UTF_NEXT_CHAR(src, i, srcLength, c);
127 src+=i;
128 srcLength-=i;
129 if(!IS_BIDI_CONTROL_CHAR(c)) {
130 remaining-=i;
131 if(remaining<0) {
132 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
133
134 /* preflight the length */
135 while(srcLength>0) {
136 c=*src++;
137 if(!IS_BIDI_CONTROL_CHAR(c)) {
138 --remaining;
139 }
140 --srcLength;
141 }
142 return destSize-remaining;
143 }
144 c=u_charMirror(c);
145 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
146 }
147 } while(srcLength>0);
148 return j;
149 }
150 } /* end of switch */
151 }
152
153 static int32_t
doWriteReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)154 doWriteReverse(const UChar *src, int32_t srcLength,
155 UChar *dest, int32_t destSize,
156 uint16_t options,
157 UErrorCode *pErrorCode) {
158 /*
159 * RTL run -
160 *
161 * RTL runs need to be copied to the destination in reverse order
162 * of code points, not code units, to keep Unicode characters intact.
163 *
164 * The general strategy for this is to read the source text
165 * in backward order, collect all code units for a code point
166 * (and optionally following combining characters, see below),
167 * and copy all these code units in ascending order
168 * to the destination for this run.
169 *
170 * Several options request whether combining characters
171 * should be kept after their base characters,
172 * whether BiDi control characters should be removed, and
173 * whether characters should be replaced by their mirror-image
174 * equivalent Unicode characters.
175 */
176 int32_t i, j;
177 UChar32 c;
178
179 /* optimize for several combinations of options */
180 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
181 case 0:
182 /*
183 * With none of the "complicated" options set, the destination
184 * run will have the same length as the source run,
185 * and there is no mirroring and no keeping combining characters
186 * with their base characters.
187 */
188 if(destSize<srcLength) {
189 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
190 return srcLength;
191 }
192 destSize=srcLength;
193
194 /* preserve character integrity */
195 do {
196 /* i is always after the last code unit known to need to be kept in this segment */
197 i=srcLength;
198
199 /* collect code units for one base character */
200 UTF_BACK_1(src, 0, srcLength);
201
202 /* copy this base character */
203 j=srcLength;
204 do {
205 *dest++=src[j++];
206 } while(j<i);
207 } while(srcLength>0);
208 break;
209 case UBIDI_KEEP_BASE_COMBINING:
210 /*
211 * Here, too, the destination
212 * run will have the same length as the source run,
213 * and there is no mirroring.
214 * We do need to keep combining characters with their base characters.
215 */
216 if(destSize<srcLength) {
217 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
218 return srcLength;
219 }
220 destSize=srcLength;
221
222 /* preserve character integrity */
223 do {
224 /* i is always after the last code unit known to need to be kept in this segment */
225 i=srcLength;
226
227 /* collect code units and modifier letters for one base character */
228 do {
229 UTF_PREV_CHAR(src, 0, srcLength, c);
230 } while(srcLength>0 && IS_COMBINING(u_charType(c)));
231
232 /* copy this "user character" */
233 j=srcLength;
234 do {
235 *dest++=src[j++];
236 } while(j<i);
237 } while(srcLength>0);
238 break;
239 default:
240 /*
241 * With several "complicated" options set, this is the most
242 * general and the slowest copying of an RTL run.
243 * We will do mirroring, remove BiDi controls, and
244 * keep combining characters with their base characters
245 * as requested.
246 */
247 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
248 i=srcLength;
249 } else {
250 /* we need to find out the destination length of the run,
251 which will not include the BiDi control characters */
252 int32_t length=srcLength;
253 UChar ch;
254
255 i=0;
256 do {
257 ch=*src++;
258 if(!IS_BIDI_CONTROL_CHAR(ch)) {
259 ++i;
260 }
261 } while(--length>0);
262 src-=srcLength;
263 }
264
265 if(destSize<i) {
266 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
267 return i;
268 }
269 destSize=i;
270
271 /* preserve character integrity */
272 do {
273 /* i is always after the last code unit known to need to be kept in this segment */
274 i=srcLength;
275
276 /* collect code units for one base character */
277 UTF_PREV_CHAR(src, 0, srcLength, c);
278 if(options&UBIDI_KEEP_BASE_COMBINING) {
279 /* collect modifier letters for this base character */
280 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
281 UTF_PREV_CHAR(src, 0, srcLength, c);
282 }
283 }
284
285 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
286 /* do not copy this BiDi control character */
287 continue;
288 }
289
290 /* copy this "user character" */
291 j=srcLength;
292 if(options&UBIDI_DO_MIRRORING) {
293 /* mirror only the base character */
294 int32_t k=0;
295 c=u_charMirror(c);
296 UTF_APPEND_CHAR_UNSAFE(dest, k, c);
297 dest+=k;
298 j+=k;
299 }
300 while(j<i) {
301 *dest++=src[j++];
302 }
303 } while(srcLength>0);
304 break;
305 } /* end of switch */
306
307 return destSize;
308 }
309
310 U_CAPI int32_t U_EXPORT2
ubidi_writeReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)311 ubidi_writeReverse(const UChar *src, int32_t srcLength,
312 UChar *dest, int32_t destSize,
313 uint16_t options,
314 UErrorCode *pErrorCode) {
315 int32_t destLength;
316
317 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
318 return 0;
319 }
320
321 /* more error checking */
322 if( src==NULL || srcLength<-1 ||
323 destSize<0 || (destSize>0 && dest==NULL))
324 {
325 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
326 return 0;
327 }
328
329 /* do input and output overlap? */
330 if( dest!=NULL &&
331 ((src>=dest && src<dest+destSize) ||
332 (dest>=src && dest<src+srcLength)))
333 {
334 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
335 return 0;
336 }
337
338 if(srcLength==-1) {
339 srcLength=u_strlen(src);
340 }
341 if(srcLength>0) {
342 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
343 } else {
344 /* nothing to do */
345 destLength=0;
346 }
347
348 return u_terminateUChars(dest, destSize, destLength, pErrorCode);
349 }
350
351 U_CAPI int32_t U_EXPORT2
ubidi_writeReordered(UBiDi * pBiDi,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)352 ubidi_writeReordered(UBiDi *pBiDi,
353 UChar *dest, int32_t destSize,
354 uint16_t options,
355 UErrorCode *pErrorCode) {
356 const UChar *text;
357 UChar *saveDest;
358 int32_t length, destCapacity;
359 int32_t run, runCount, logicalStart, runLength;
360
361 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
362 return 0;
363 }
364
365 /* more error checking */
366 if( pBiDi==NULL ||
367 (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
368 destSize<0 || (destSize>0 && dest==NULL))
369 {
370 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
371 return 0;
372 }
373
374 /* do input and output overlap? */
375 if( dest!=NULL &&
376 ((text>=dest && text<dest+destSize) ||
377 (dest>=text && dest<text+pBiDi->originalLength)))
378 {
379 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
380 return 0;
381 }
382
383 if(length==0) {
384 /* nothing to do */
385 return u_terminateUChars(dest, destSize, 0, pErrorCode);
386 }
387
388 runCount=ubidi_countRuns(pBiDi, pErrorCode);
389 if(U_FAILURE(*pErrorCode)) {
390 return 0;
391 }
392
393 /* destSize shrinks, later destination length=destCapacity-destSize */
394 saveDest=dest;
395 destCapacity=destSize;
396
397 /*
398 * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
399 * reordering mode (checked below) is appropriate.
400 */
401 if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
402 options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
403 options&=~UBIDI_REMOVE_BIDI_CONTROLS;
404 }
405 /*
406 * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
407 * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
408 */
409 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
410 options|=UBIDI_REMOVE_BIDI_CONTROLS;
411 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
412 }
413 /*
414 * If we do not perform the "inverse BiDi" algorithm, then we
415 * don't need to insert any LRMs, and don't need to test for it.
416 */
417 if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
418 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT) &&
419 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
420 (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
421 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
422 }
423 /*
424 * Iterate through all visual runs and copy the run text segments to
425 * the destination, according to the options.
426 *
427 * The tests for where to insert LRMs ignore the fact that there may be
428 * BN codes or non-BMP code points at the beginning and end of a run;
429 * they may insert LRMs unnecessarily but the tests are faster this way
430 * (this would have to be improved for UTF-8).
431 *
432 * Note that the only errors that are set by doWriteXY() are buffer overflow
433 * errors. Ignore them until the end, and continue for preflighting.
434 */
435 if(!(options&UBIDI_OUTPUT_REVERSE)) {
436 /* forward output */
437 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
438 /* do not insert BiDi controls */
439 for(run=0; run<runCount; ++run) {
440 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
441 runLength=doWriteForward(text+logicalStart, runLength,
442 dest, destSize,
443 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
444 } else {
445 runLength=doWriteReverse(text+logicalStart, runLength,
446 dest, destSize,
447 options, pErrorCode);
448 }
449 dest+=runLength;
450 destSize-=runLength;
451 }
452 } else {
453 /* insert BiDi controls for "inverse BiDi" */
454 const DirProp *dirProps=pBiDi->dirProps;
455 const UChar *src;
456 UChar uc;
457 UBiDiDirection dir;
458 int32_t markFlag;
459
460 for(run=0; run<runCount; ++run) {
461 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
462 src=text+logicalStart;
463 /* check if something relevant in insertPoints */
464 markFlag=pBiDi->runs[run].insertRemove;
465 if(markFlag<0) { /* BiDi controls count */
466 markFlag=0;
467 }
468
469 if(UBIDI_LTR==dir) {
470 if((pBiDi->isInverse) &&
471 (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
472 markFlag |= LRM_BEFORE;
473 }
474 if (markFlag & LRM_BEFORE) {
475 uc=LRM_CHAR;
476 }
477 else if (markFlag & RLM_BEFORE) {
478 uc=RLM_CHAR;
479 }
480 else uc=0;
481 if(uc) {
482 if(destSize>0) {
483 *dest++=uc;
484 }
485 --destSize;
486 }
487
488 runLength=doWriteForward(src, runLength,
489 dest, destSize,
490 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
491 dest+=runLength;
492 destSize-=runLength;
493
494 if((pBiDi->isInverse) &&
495 (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
496 markFlag |= LRM_AFTER;
497 }
498 if (markFlag & LRM_AFTER) {
499 uc=LRM_CHAR;
500 }
501 else if (markFlag & RLM_AFTER) {
502 uc=RLM_CHAR;
503 }
504 else uc=0;
505 if(uc) {
506 if(destSize>0) {
507 *dest++=uc;
508 }
509 --destSize;
510 }
511 } else { /* RTL run */
512 if((pBiDi->isInverse) &&
513 (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
514 markFlag |= RLM_BEFORE;
515 }
516 if (markFlag & LRM_BEFORE) {
517 uc=LRM_CHAR;
518 }
519 else if (markFlag & RLM_BEFORE) {
520 uc=RLM_CHAR;
521 }
522 else uc=0;
523 if(uc) {
524 if(destSize>0) {
525 *dest++=uc;
526 }
527 --destSize;
528 }
529
530 runLength=doWriteReverse(src, runLength,
531 dest, destSize,
532 options, pErrorCode);
533 dest+=runLength;
534 destSize-=runLength;
535
536 if((pBiDi->isInverse) &&
537 (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
538 markFlag |= RLM_AFTER;
539 }
540 if (markFlag & LRM_AFTER) {
541 uc=LRM_CHAR;
542 }
543 else if (markFlag & RLM_AFTER) {
544 uc=RLM_CHAR;
545 }
546 else uc=0;
547 if(uc) {
548 if(destSize>0) {
549 *dest++=uc;
550 }
551 --destSize;
552 }
553 }
554 }
555 }
556 } else {
557 /* reverse output */
558 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
559 /* do not insert BiDi controls */
560 for(run=runCount; --run>=0;) {
561 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
562 runLength=doWriteReverse(text+logicalStart, runLength,
563 dest, destSize,
564 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
565 } else {
566 runLength=doWriteForward(text+logicalStart, runLength,
567 dest, destSize,
568 options, pErrorCode);
569 }
570 dest+=runLength;
571 destSize-=runLength;
572 }
573 } else {
574 /* insert BiDi controls for "inverse BiDi" */
575 const DirProp *dirProps=pBiDi->dirProps;
576 const UChar *src;
577 UBiDiDirection dir;
578
579 for(run=runCount; --run>=0;) {
580 /* reverse output */
581 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
582 src=text+logicalStart;
583
584 if(UBIDI_LTR==dir) {
585 if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
586 if(destSize>0) {
587 *dest++=LRM_CHAR;
588 }
589 --destSize;
590 }
591
592 runLength=doWriteReverse(src, runLength,
593 dest, destSize,
594 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
595 dest+=runLength;
596 destSize-=runLength;
597
598 if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
599 if(destSize>0) {
600 *dest++=LRM_CHAR;
601 }
602 --destSize;
603 }
604 } else {
605 if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
606 if(destSize>0) {
607 *dest++=RLM_CHAR;
608 }
609 --destSize;
610 }
611
612 runLength=doWriteForward(src, runLength,
613 dest, destSize,
614 options, pErrorCode);
615 dest+=runLength;
616 destSize-=runLength;
617
618 if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
619 if(destSize>0) {
620 *dest++=RLM_CHAR;
621 }
622 --destSize;
623 }
624 }
625 }
626 }
627 }
628
629 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
630 }
631