1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ubidiwrt.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 1999aug06
14 * created by: Markus W. Scherer, updated by Matitiahu Allouche
15 *
16 * This file contains implementations for BiDi functions that use
17 * the core algorithm and core API to write reordered text.
18 */
19
20 /* set import/export definitions */
21 #ifndef U_COMMON_IMPLEMENTATION
22 # define U_COMMON_IMPLEMENTATION
23 #endif
24
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ubidi.h"
29 #include "unicode/utf16.h"
30 #include "cmemory.h"
31 #include "ustr_imp.h"
32 #include "ubidiimp.h"
33
34 /*
35 * The function implementations in this file are designed
36 * for UTF-16 and UTF-32, not for UTF-8.
37 *
38 * Assumptions that are not true for UTF-8:
39 * - Any code point always needs the same number of code units
40 * ("minimum-length-problem" of UTF-8)
41 * - The BiDi control characters need only one code unit each
42 *
43 * Further assumptions for all UTFs:
44 * - u_charMirror(c) needs the same number of code units as c
45 */
46 #if UTF_SIZE==8
47 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
48 #endif
49
/*
 * Nonzero if the general category value `type` is one of
 * U_NON_SPACING_MARK, U_COMBINING_SPACING_MARK or U_ENCLOSING_MARK.
 * Implemented as a bit-set membership test: one bit per category value.
 */
#define IS_COMBINING(type) \
    (((1UL<<U_NON_SPACING_MARK)| \
      (1UL<<U_COMBINING_SPACING_MARK)| \
      (1UL<<U_ENCLOSING_MARK)) & (1UL<<(type)))
51
52 /*
53 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
54 * semantically write RTL runs in reverse and later reverse them again.
55 * Instead, we actually write them in forward order to begin with.
56 * However, if the RTL run was to be mirrored, we need to mirror here now
57 * since the implicit second reversal must not do it.
58 * It looks strange to do mirroring in LTR output, but it is only because
59 * we are writing RTL output in reverse.
60 */
61 static int32_t
doWriteForward(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)62 doWriteForward(const UChar *src, int32_t srcLength,
63 UChar *dest, int32_t destSize,
64 uint16_t options,
65 UErrorCode *pErrorCode) {
66 /* optimize for several combinations of options */
67 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
68 case 0: {
69 /* simply copy the LTR run to the destination */
70 int32_t length=srcLength;
71 if(destSize<length) {
72 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
73 return srcLength;
74 }
75 do {
76 *dest++=*src++;
77 } while(--length>0);
78 return srcLength;
79 }
80 case UBIDI_DO_MIRRORING: {
81 /* do mirroring */
82 int32_t i=0, j=0;
83 UChar32 c;
84
85 if(destSize<srcLength) {
86 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
87 return srcLength;
88 }
89 do {
90 U16_NEXT(src, i, srcLength, c);
91 c=u_charMirror(c);
92 U16_APPEND_UNSAFE(dest, j, c);
93 } while(i<srcLength);
94 return srcLength;
95 }
96 case UBIDI_REMOVE_BIDI_CONTROLS: {
97 /* copy the LTR run and remove any BiDi control characters */
98 int32_t remaining=destSize;
99 UChar c;
100 do {
101 c=*src++;
102 if(!IS_BIDI_CONTROL_CHAR(c)) {
103 if(--remaining<0) {
104 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
105
106 /* preflight the length */
107 while(--srcLength>0) {
108 c=*src++;
109 if(!IS_BIDI_CONTROL_CHAR(c)) {
110 --remaining;
111 }
112 }
113 return destSize-remaining;
114 }
115 *dest++=c;
116 }
117 } while(--srcLength>0);
118 return destSize-remaining;
119 }
120 default: {
121 /* remove BiDi control characters and do mirroring */
122 int32_t remaining=destSize;
123 int32_t i, j=0;
124 UChar32 c;
125 do {
126 i=0;
127 U16_NEXT(src, i, srcLength, c);
128 src+=i;
129 srcLength-=i;
130 if(!IS_BIDI_CONTROL_CHAR(c)) {
131 remaining-=i;
132 if(remaining<0) {
133 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
134
135 /* preflight the length */
136 while(srcLength>0) {
137 c=*src++;
138 if(!IS_BIDI_CONTROL_CHAR(c)) {
139 --remaining;
140 }
141 --srcLength;
142 }
143 return destSize-remaining;
144 }
145 c=u_charMirror(c);
146 U16_APPEND_UNSAFE(dest, j, c);
147 }
148 } while(srcLength>0);
149 return j;
150 }
151 } /* end of switch */
152 }
153
154 static int32_t
doWriteReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)155 doWriteReverse(const UChar *src, int32_t srcLength,
156 UChar *dest, int32_t destSize,
157 uint16_t options,
158 UErrorCode *pErrorCode) {
159 /*
160 * RTL run -
161 *
162 * RTL runs need to be copied to the destination in reverse order
163 * of code points, not code units, to keep Unicode characters intact.
164 *
165 * The general strategy for this is to read the source text
166 * in backward order, collect all code units for a code point
167 * (and optionally following combining characters, see below),
168 * and copy all these code units in ascending order
169 * to the destination for this run.
170 *
171 * Several options request whether combining characters
172 * should be kept after their base characters,
173 * whether BiDi control characters should be removed, and
174 * whether characters should be replaced by their mirror-image
175 * equivalent Unicode characters.
176 */
177 int32_t i, j;
178 UChar32 c;
179
180 /* optimize for several combinations of options */
181 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
182 case 0:
183 /*
184 * With none of the "complicated" options set, the destination
185 * run will have the same length as the source run,
186 * and there is no mirroring and no keeping combining characters
187 * with their base characters.
188 */
189 if(destSize<srcLength) {
190 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
191 return srcLength;
192 }
193 destSize=srcLength;
194
195 /* preserve character integrity */
196 do {
197 /* i is always after the last code unit known to need to be kept in this segment */
198 i=srcLength;
199
200 /* collect code units for one base character */
201 U16_BACK_1(src, 0, srcLength);
202
203 /* copy this base character */
204 j=srcLength;
205 do {
206 *dest++=src[j++];
207 } while(j<i);
208 } while(srcLength>0);
209 break;
210 case UBIDI_KEEP_BASE_COMBINING:
211 /*
212 * Here, too, the destination
213 * run will have the same length as the source run,
214 * and there is no mirroring.
215 * We do need to keep combining characters with their base characters.
216 */
217 if(destSize<srcLength) {
218 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
219 return srcLength;
220 }
221 destSize=srcLength;
222
223 /* preserve character integrity */
224 do {
225 /* i is always after the last code unit known to need to be kept in this segment */
226 i=srcLength;
227
228 /* collect code units and modifier letters for one base character */
229 do {
230 U16_PREV(src, 0, srcLength, c);
231 } while(srcLength>0 && IS_COMBINING(u_charType(c)));
232
233 /* copy this "user character" */
234 j=srcLength;
235 do {
236 *dest++=src[j++];
237 } while(j<i);
238 } while(srcLength>0);
239 break;
240 default:
241 /*
242 * With several "complicated" options set, this is the most
243 * general and the slowest copying of an RTL run.
244 * We will do mirroring, remove BiDi controls, and
245 * keep combining characters with their base characters
246 * as requested.
247 */
248 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
249 i=srcLength;
250 } else {
251 /* we need to find out the destination length of the run,
252 which will not include the BiDi control characters */
253 int32_t length=srcLength;
254 UChar ch;
255
256 i=0;
257 do {
258 ch=*src++;
259 if(!IS_BIDI_CONTROL_CHAR(ch)) {
260 ++i;
261 }
262 } while(--length>0);
263 src-=srcLength;
264 }
265
266 if(destSize<i) {
267 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
268 return i;
269 }
270 destSize=i;
271
272 /* preserve character integrity */
273 do {
274 /* i is always after the last code unit known to need to be kept in this segment */
275 i=srcLength;
276
277 /* collect code units for one base character */
278 U16_PREV(src, 0, srcLength, c);
279 if(options&UBIDI_KEEP_BASE_COMBINING) {
280 /* collect modifier letters for this base character */
281 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
282 U16_PREV(src, 0, srcLength, c);
283 }
284 }
285
286 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
287 /* do not copy this BiDi control character */
288 continue;
289 }
290
291 /* copy this "user character" */
292 j=srcLength;
293 if(options&UBIDI_DO_MIRRORING) {
294 /* mirror only the base character */
295 int32_t k=0;
296 c=u_charMirror(c);
297 U16_APPEND_UNSAFE(dest, k, c);
298 dest+=k;
299 j+=k;
300 }
301 while(j<i) {
302 *dest++=src[j++];
303 }
304 } while(srcLength>0);
305 break;
306 } /* end of switch */
307
308 return destSize;
309 }
310
311 U_CAPI int32_t U_EXPORT2
ubidi_writeReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)312 ubidi_writeReverse(const UChar *src, int32_t srcLength,
313 UChar *dest, int32_t destSize,
314 uint16_t options,
315 UErrorCode *pErrorCode) {
316 int32_t destLength;
317
318 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
319 return 0;
320 }
321
322 /* more error checking */
323 if( src==NULL || srcLength<-1 ||
324 destSize<0 || (destSize>0 && dest==NULL))
325 {
326 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
327 return 0;
328 }
329
330 /* do input and output overlap? */
331 if( dest!=NULL &&
332 ((src>=dest && src<dest+destSize) ||
333 (dest>=src && dest<src+srcLength)))
334 {
335 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
336 return 0;
337 }
338
339 if(srcLength==-1) {
340 srcLength=u_strlen(src);
341 }
342 if(srcLength>0) {
343 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
344 } else {
345 /* nothing to do */
346 destLength=0;
347 }
348
349 return u_terminateUChars(dest, destSize, destLength, pErrorCode);
350 }
351
352 U_CAPI int32_t U_EXPORT2
ubidi_writeReordered(UBiDi * pBiDi,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)353 ubidi_writeReordered(UBiDi *pBiDi,
354 UChar *dest, int32_t destSize,
355 uint16_t options,
356 UErrorCode *pErrorCode) {
357 const UChar *text;
358 UChar *saveDest;
359 int32_t length, destCapacity;
360 int32_t run, runCount, logicalStart, runLength;
361
362 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
363 return 0;
364 }
365
366 /* more error checking */
367 if( pBiDi==NULL ||
368 (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
369 destSize<0 || (destSize>0 && dest==NULL))
370 {
371 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
372 return 0;
373 }
374
375 /* do input and output overlap? */
376 if( dest!=NULL &&
377 ((text>=dest && text<dest+destSize) ||
378 (dest>=text && dest<text+pBiDi->originalLength)))
379 {
380 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
381 return 0;
382 }
383
384 if(length==0) {
385 /* nothing to do */
386 return u_terminateUChars(dest, destSize, 0, pErrorCode);
387 }
388
389 runCount=ubidi_countRuns(pBiDi, pErrorCode);
390 if(U_FAILURE(*pErrorCode)) {
391 return 0;
392 }
393
394 /* destSize shrinks, later destination length=destCapacity-destSize */
395 saveDest=dest;
396 destCapacity=destSize;
397
398 /*
399 * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
400 * reordering mode (checked below) is appropriate.
401 */
402 if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
403 options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
404 options&=~UBIDI_REMOVE_BIDI_CONTROLS;
405 }
406 /*
407 * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
408 * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
409 */
410 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
411 options|=UBIDI_REMOVE_BIDI_CONTROLS;
412 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
413 }
414 /*
415 * If we do not perform the "inverse BiDi" algorithm, then we
416 * don't need to insert any LRMs, and don't need to test for it.
417 */
418 if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
419 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT) &&
420 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
421 (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
422 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
423 }
424 /*
425 * Iterate through all visual runs and copy the run text segments to
426 * the destination, according to the options.
427 *
428 * The tests for where to insert LRMs ignore the fact that there may be
429 * BN codes or non-BMP code points at the beginning and end of a run;
430 * they may insert LRMs unnecessarily but the tests are faster this way
431 * (this would have to be improved for UTF-8).
432 *
433 * Note that the only errors that are set by doWriteXY() are buffer overflow
434 * errors. Ignore them until the end, and continue for preflighting.
435 */
436 if(!(options&UBIDI_OUTPUT_REVERSE)) {
437 /* forward output */
438 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
439 /* do not insert BiDi controls */
440 for(run=0; run<runCount; ++run) {
441 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
442 runLength=doWriteForward(text+logicalStart, runLength,
443 dest, destSize,
444 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
445 } else {
446 runLength=doWriteReverse(text+logicalStart, runLength,
447 dest, destSize,
448 options, pErrorCode);
449 }
450 if(dest!=NULL) {
451 dest+=runLength;
452 }
453 destSize-=runLength;
454 }
455 } else {
456 /* insert BiDi controls for "inverse BiDi" */
457 const DirProp *dirProps=pBiDi->dirProps;
458 const UChar *src;
459 UChar uc;
460 UBiDiDirection dir;
461 int32_t markFlag;
462
463 for(run=0; run<runCount; ++run) {
464 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
465 src=text+logicalStart;
466 /* check if something relevant in insertPoints */
467 markFlag=pBiDi->runs[run].insertRemove;
468 if(markFlag<0) { /* BiDi controls count */
469 markFlag=0;
470 }
471
472 if(UBIDI_LTR==dir) {
473 if((pBiDi->isInverse) &&
474 (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
475 markFlag |= LRM_BEFORE;
476 }
477 if (markFlag & LRM_BEFORE) {
478 uc=LRM_CHAR;
479 }
480 else if (markFlag & RLM_BEFORE) {
481 uc=RLM_CHAR;
482 }
483 else uc=0;
484 if(uc) {
485 if(destSize>0) {
486 *dest++=uc;
487 }
488 --destSize;
489 }
490
491 runLength=doWriteForward(src, runLength,
492 dest, destSize,
493 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
494 if(dest!=NULL) {
495 dest+=runLength;
496 }
497 destSize-=runLength;
498
499 if((pBiDi->isInverse) &&
500 (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
501 markFlag |= LRM_AFTER;
502 }
503 if (markFlag & LRM_AFTER) {
504 uc=LRM_CHAR;
505 }
506 else if (markFlag & RLM_AFTER) {
507 uc=RLM_CHAR;
508 }
509 else uc=0;
510 if(uc) {
511 if(destSize>0) {
512 *dest++=uc;
513 }
514 --destSize;
515 }
516 } else { /* RTL run */
517 if((pBiDi->isInverse) &&
518 (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
519 markFlag |= RLM_BEFORE;
520 }
521 if (markFlag & LRM_BEFORE) {
522 uc=LRM_CHAR;
523 }
524 else if (markFlag & RLM_BEFORE) {
525 uc=RLM_CHAR;
526 }
527 else uc=0;
528 if(uc) {
529 if(destSize>0) {
530 *dest++=uc;
531 }
532 --destSize;
533 }
534
535 runLength=doWriteReverse(src, runLength,
536 dest, destSize,
537 options, pErrorCode);
538 if(dest!=NULL) {
539 dest+=runLength;
540 }
541 destSize-=runLength;
542
543 if((pBiDi->isInverse) &&
544 (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
545 markFlag |= RLM_AFTER;
546 }
547 if (markFlag & LRM_AFTER) {
548 uc=LRM_CHAR;
549 }
550 else if (markFlag & RLM_AFTER) {
551 uc=RLM_CHAR;
552 }
553 else uc=0;
554 if(uc) {
555 if(destSize>0) {
556 *dest++=uc;
557 }
558 --destSize;
559 }
560 }
561 }
562 }
563 } else {
564 /* reverse output */
565 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
566 /* do not insert BiDi controls */
567 for(run=runCount; --run>=0;) {
568 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
569 runLength=doWriteReverse(text+logicalStart, runLength,
570 dest, destSize,
571 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
572 } else {
573 runLength=doWriteForward(text+logicalStart, runLength,
574 dest, destSize,
575 options, pErrorCode);
576 }
577 if(dest!=NULL) {
578 dest+=runLength;
579 }
580 destSize-=runLength;
581 }
582 } else {
583 /* insert BiDi controls for "inverse BiDi" */
584 const DirProp *dirProps=pBiDi->dirProps;
585 const UChar *src;
586 UBiDiDirection dir;
587
588 for(run=runCount; --run>=0;) {
589 /* reverse output */
590 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
591 src=text+logicalStart;
592
593 if(UBIDI_LTR==dir) {
594 if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
595 if(destSize>0) {
596 *dest++=LRM_CHAR;
597 }
598 --destSize;
599 }
600
601 runLength=doWriteReverse(src, runLength,
602 dest, destSize,
603 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
604 if(dest!=NULL) {
605 dest+=runLength;
606 }
607 destSize-=runLength;
608
609 if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
610 if(destSize>0) {
611 *dest++=LRM_CHAR;
612 }
613 --destSize;
614 }
615 } else {
616 if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
617 if(destSize>0) {
618 *dest++=RLM_CHAR;
619 }
620 --destSize;
621 }
622
623 runLength=doWriteForward(src, runLength,
624 dest, destSize,
625 options, pErrorCode);
626 if(dest!=NULL) {
627 dest+=runLength;
628 }
629 destSize-=runLength;
630
631 if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
632 if(destSize>0) {
633 *dest++=RLM_CHAR;
634 }
635 --destSize;
636 }
637 }
638 }
639 }
640 }
641
642 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
643 }
644