• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2000-2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  ubidiwrt.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999aug06
14 *   created by: Markus W. Scherer, updated by Matitiahu Allouche
15 *
16 * This file contains implementations for BiDi functions that use
17 * the core algorithm and core API to write reordered text.
18 */
19 
20 /* set import/export definitions */
21 #ifndef U_COMMON_IMPLEMENTATION
22 #   define U_COMMON_IMPLEMENTATION
23 #endif
24 
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/ubidi.h"
29 #include "cmemory.h"
30 #include "ustr_imp.h"
31 #include "ubidiimp.h"
32 
33 /*
34  * The function implementations in this file are designed
35  * for UTF-16 and UTF-32, not for UTF-8.
36  *
37  * Assumptions that are not true for UTF-8:
38  * - Any code point always needs the same number of code units
39  *   ("minimum-length-problem" of UTF-8)
40  * - The BiDi control characters need only one code unit each
41  *
42  * Further assumptions for all UTFs:
43  * - u_charMirror(c) needs the same number of code units as c
44  */
45 #if UTF_SIZE==8
46 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
47 #endif
48 
/*
 * Evaluates to nonzero if the character type value `type` is one of
 * U_NON_SPACING_MARK, U_COMBINING_SPACING_MARK or U_ENCLOSING_MARK,
 * and to zero otherwise. In this file it is applied to the result of
 * u_charType(). The test is done with a bit mask, so `type` must be
 * smaller than the number of bits in an unsigned long.
 */
#define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
50 
51 /*
52  * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
53  * semantically write RTL runs in reverse and later reverse them again.
54  * Instead, we actually write them in forward order to begin with.
55  * However, if the RTL run was to be mirrored, we need to mirror here now
56  * since the implicit second reversal must not do it.
57  * It looks strange to do mirroring in LTR output, but it is only because
58  * we are writing RTL output in reverse.
59  */
60 static int32_t
doWriteForward(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)61 doWriteForward(const UChar *src, int32_t srcLength,
62                UChar *dest, int32_t destSize,
63                uint16_t options,
64                UErrorCode *pErrorCode) {
65     /* optimize for several combinations of options */
66     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
67     case 0: {
68         /* simply copy the LTR run to the destination */
69         int32_t length=srcLength;
70         if(destSize<length) {
71             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
72             return srcLength;
73         }
74         do {
75             *dest++=*src++;
76         } while(--length>0);
77         return srcLength;
78     }
79     case UBIDI_DO_MIRRORING: {
80         /* do mirroring */
81         int32_t i=0, j=0;
82         UChar32 c;
83 
84         if(destSize<srcLength) {
85             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86             return srcLength;
87         }
88         do {
89             UTF_NEXT_CHAR(src, i, srcLength, c);
90             c=u_charMirror(c);
91             UTF_APPEND_CHAR_UNSAFE(dest, j, c);
92         } while(i<srcLength);
93         return srcLength;
94     }
95     case UBIDI_REMOVE_BIDI_CONTROLS: {
96         /* copy the LTR run and remove any BiDi control characters */
97         int32_t remaining=destSize;
98         UChar c;
99         do {
100             c=*src++;
101             if(!IS_BIDI_CONTROL_CHAR(c)) {
102                 if(--remaining<0) {
103                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
104 
105                     /* preflight the length */
106                     while(--srcLength>0) {
107                         c=*src++;
108                         if(!IS_BIDI_CONTROL_CHAR(c)) {
109                             --remaining;
110                         }
111                     }
112                     return destSize-remaining;
113                 }
114                 *dest++=c;
115             }
116         } while(--srcLength>0);
117         return destSize-remaining;
118     }
119     default: {
120         /* remove BiDi control characters and do mirroring */
121         int32_t remaining=destSize;
122         int32_t i, j=0;
123         UChar32 c;
124         do {
125             i=0;
126             UTF_NEXT_CHAR(src, i, srcLength, c);
127             src+=i;
128             srcLength-=i;
129             if(!IS_BIDI_CONTROL_CHAR(c)) {
130                 remaining-=i;
131                 if(remaining<0) {
132                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
133 
134                     /* preflight the length */
135                     while(srcLength>0) {
136                         c=*src++;
137                         if(!IS_BIDI_CONTROL_CHAR(c)) {
138                             --remaining;
139                         }
140                         --srcLength;
141                     }
142                     return destSize-remaining;
143                 }
144                 c=u_charMirror(c);
145                 UTF_APPEND_CHAR_UNSAFE(dest, j, c);
146             }
147         } while(srcLength>0);
148         return j;
149     }
150     } /* end of switch */
151 }
152 
153 static int32_t
doWriteReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)154 doWriteReverse(const UChar *src, int32_t srcLength,
155                UChar *dest, int32_t destSize,
156                uint16_t options,
157                UErrorCode *pErrorCode) {
158     /*
159      * RTL run -
160      *
161      * RTL runs need to be copied to the destination in reverse order
162      * of code points, not code units, to keep Unicode characters intact.
163      *
164      * The general strategy for this is to read the source text
165      * in backward order, collect all code units for a code point
166      * (and optionally following combining characters, see below),
167      * and copy all these code units in ascending order
168      * to the destination for this run.
169      *
170      * Several options request whether combining characters
171      * should be kept after their base characters,
172      * whether BiDi control characters should be removed, and
173      * whether characters should be replaced by their mirror-image
174      * equivalent Unicode characters.
175      */
176     int32_t i, j;
177     UChar32 c;
178 
179     /* optimize for several combinations of options */
180     switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
181     case 0:
182         /*
183          * With none of the "complicated" options set, the destination
184          * run will have the same length as the source run,
185          * and there is no mirroring and no keeping combining characters
186          * with their base characters.
187          */
188         if(destSize<srcLength) {
189             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
190             return srcLength;
191         }
192         destSize=srcLength;
193 
194         /* preserve character integrity */
195         do {
196             /* i is always after the last code unit known to need to be kept in this segment */
197             i=srcLength;
198 
199             /* collect code units for one base character */
200             UTF_BACK_1(src, 0, srcLength);
201 
202             /* copy this base character */
203             j=srcLength;
204             do {
205                 *dest++=src[j++];
206             } while(j<i);
207         } while(srcLength>0);
208         break;
209     case UBIDI_KEEP_BASE_COMBINING:
210         /*
211          * Here, too, the destination
212          * run will have the same length as the source run,
213          * and there is no mirroring.
214          * We do need to keep combining characters with their base characters.
215          */
216         if(destSize<srcLength) {
217             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
218             return srcLength;
219         }
220         destSize=srcLength;
221 
222         /* preserve character integrity */
223         do {
224             /* i is always after the last code unit known to need to be kept in this segment */
225             i=srcLength;
226 
227             /* collect code units and modifier letters for one base character */
228             do {
229                 UTF_PREV_CHAR(src, 0, srcLength, c);
230             } while(srcLength>0 && IS_COMBINING(u_charType(c)));
231 
232             /* copy this "user character" */
233             j=srcLength;
234             do {
235                 *dest++=src[j++];
236             } while(j<i);
237         } while(srcLength>0);
238         break;
239     default:
240         /*
241          * With several "complicated" options set, this is the most
242          * general and the slowest copying of an RTL run.
243          * We will do mirroring, remove BiDi controls, and
244          * keep combining characters with their base characters
245          * as requested.
246          */
247         if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
248             i=srcLength;
249         } else {
250             /* we need to find out the destination length of the run,
251                which will not include the BiDi control characters */
252             int32_t length=srcLength;
253             UChar ch;
254 
255             i=0;
256             do {
257                 ch=*src++;
258                 if(!IS_BIDI_CONTROL_CHAR(ch)) {
259                     ++i;
260                 }
261             } while(--length>0);
262             src-=srcLength;
263         }
264 
265         if(destSize<i) {
266             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
267             return i;
268         }
269         destSize=i;
270 
271         /* preserve character integrity */
272         do {
273             /* i is always after the last code unit known to need to be kept in this segment */
274             i=srcLength;
275 
276             /* collect code units for one base character */
277             UTF_PREV_CHAR(src, 0, srcLength, c);
278             if(options&UBIDI_KEEP_BASE_COMBINING) {
279                 /* collect modifier letters for this base character */
280                 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
281                     UTF_PREV_CHAR(src, 0, srcLength, c);
282                 }
283             }
284 
285             if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
286                 /* do not copy this BiDi control character */
287                 continue;
288             }
289 
290             /* copy this "user character" */
291             j=srcLength;
292             if(options&UBIDI_DO_MIRRORING) {
293                 /* mirror only the base character */
294                 int32_t k=0;
295                 c=u_charMirror(c);
296                 UTF_APPEND_CHAR_UNSAFE(dest, k, c);
297                 dest+=k;
298                 j+=k;
299             }
300             while(j<i) {
301                 *dest++=src[j++];
302             }
303         } while(srcLength>0);
304         break;
305     } /* end of switch */
306 
307     return destSize;
308 }
309 
310 U_CAPI int32_t U_EXPORT2
ubidi_writeReverse(const UChar * src,int32_t srcLength,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)311 ubidi_writeReverse(const UChar *src, int32_t srcLength,
312                    UChar *dest, int32_t destSize,
313                    uint16_t options,
314                    UErrorCode *pErrorCode) {
315     int32_t destLength;
316 
317     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
318         return 0;
319     }
320 
321     /* more error checking */
322     if( src==NULL || srcLength<-1 ||
323         destSize<0 || (destSize>0 && dest==NULL))
324     {
325         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
326         return 0;
327     }
328 
329     /* do input and output overlap? */
330     if( dest!=NULL &&
331         ((src>=dest && src<dest+destSize) ||
332          (dest>=src && dest<src+srcLength)))
333     {
334         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
335         return 0;
336     }
337 
338     if(srcLength==-1) {
339         srcLength=u_strlen(src);
340     }
341     if(srcLength>0) {
342         destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
343     } else {
344         /* nothing to do */
345         destLength=0;
346     }
347 
348     return u_terminateUChars(dest, destSize, destLength, pErrorCode);
349 }
350 
351 U_CAPI int32_t U_EXPORT2
ubidi_writeReordered(UBiDi * pBiDi,UChar * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)352 ubidi_writeReordered(UBiDi *pBiDi,
353                      UChar *dest, int32_t destSize,
354                      uint16_t options,
355                      UErrorCode *pErrorCode) {
356     const UChar *text;
357     UChar *saveDest;
358     int32_t length, destCapacity;
359     int32_t run, runCount, logicalStart, runLength;
360 
361     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
362         return 0;
363     }
364 
365     /* more error checking */
366     if( pBiDi==NULL ||
367         (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
368         destSize<0 || (destSize>0 && dest==NULL))
369     {
370         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
371         return 0;
372     }
373 
374     /* do input and output overlap? */
375     if( dest!=NULL &&
376         ((text>=dest && text<dest+destSize) ||
377          (dest>=text && dest<text+pBiDi->originalLength)))
378     {
379         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
380         return 0;
381     }
382 
383     if(length==0) {
384         /* nothing to do */
385         return u_terminateUChars(dest, destSize, 0, pErrorCode);
386     }
387 
388     runCount=ubidi_countRuns(pBiDi, pErrorCode);
389     if(U_FAILURE(*pErrorCode)) {
390         return 0;
391     }
392 
393     /* destSize shrinks, later destination length=destCapacity-destSize */
394     saveDest=dest;
395     destCapacity=destSize;
396 
397     /*
398      * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
399      * reordering mode (checked below) is appropriate.
400      */
401     if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
402         options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
403         options&=~UBIDI_REMOVE_BIDI_CONTROLS;
404     }
405     /*
406      * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
407      * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
408      */
409     if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
410         options|=UBIDI_REMOVE_BIDI_CONTROLS;
411         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
412     }
413     /*
414      * If we do not perform the "inverse BiDi" algorithm, then we
415      * don't need to insert any LRMs, and don't need to test for it.
416      */
417     if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
418        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
419        (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
420        (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
421         options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
422     }
423     /*
424      * Iterate through all visual runs and copy the run text segments to
425      * the destination, according to the options.
426      *
427      * The tests for where to insert LRMs ignore the fact that there may be
428      * BN codes or non-BMP code points at the beginning and end of a run;
429      * they may insert LRMs unnecessarily but the tests are faster this way
430      * (this would have to be improved for UTF-8).
431      *
432      * Note that the only errors that are set by doWriteXY() are buffer overflow
433      * errors. Ignore them until the end, and continue for preflighting.
434      */
435     if(!(options&UBIDI_OUTPUT_REVERSE)) {
436         /* forward output */
437         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
438             /* do not insert BiDi controls */
439             for(run=0; run<runCount; ++run) {
440                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
441                     runLength=doWriteForward(text+logicalStart, runLength,
442                                              dest, destSize,
443                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
444                 } else {
445                     runLength=doWriteReverse(text+logicalStart, runLength,
446                                              dest, destSize,
447                                              options, pErrorCode);
448                 }
449                 dest+=runLength;
450                 destSize-=runLength;
451             }
452         } else {
453             /* insert BiDi controls for "inverse BiDi" */
454             const DirProp *dirProps=pBiDi->dirProps;
455             const UChar *src;
456             UChar uc;
457             UBiDiDirection dir;
458             int32_t markFlag;
459 
460             for(run=0; run<runCount; ++run) {
461                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
462                 src=text+logicalStart;
463                 /* check if something relevant in insertPoints */
464                 markFlag=pBiDi->runs[run].insertRemove;
465                 if(markFlag<0) {        /* BiDi controls count */
466                     markFlag=0;
467                 }
468 
469                 if(UBIDI_LTR==dir) {
470                     if((pBiDi->isInverse) &&
471                        (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
472                         markFlag |= LRM_BEFORE;
473                     }
474                     if (markFlag & LRM_BEFORE) {
475                         uc=LRM_CHAR;
476                     }
477                     else if (markFlag & RLM_BEFORE) {
478                         uc=RLM_CHAR;
479                     }
480                     else  uc=0;
481                     if(uc) {
482                         if(destSize>0) {
483                             *dest++=uc;
484                         }
485                         --destSize;
486                     }
487 
488                     runLength=doWriteForward(src, runLength,
489                                              dest, destSize,
490                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
491                     dest+=runLength;
492                     destSize-=runLength;
493 
494                     if((pBiDi->isInverse) &&
495                        (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
496                         markFlag |= LRM_AFTER;
497                     }
498                     if (markFlag & LRM_AFTER) {
499                         uc=LRM_CHAR;
500                     }
501                     else if (markFlag & RLM_AFTER) {
502                         uc=RLM_CHAR;
503                     }
504                     else  uc=0;
505                     if(uc) {
506                         if(destSize>0) {
507                             *dest++=uc;
508                         }
509                         --destSize;
510                     }
511                 } else {                /* RTL run */
512                     if((pBiDi->isInverse) &&
513                        (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
514                         markFlag |= RLM_BEFORE;
515                     }
516                     if (markFlag & LRM_BEFORE) {
517                         uc=LRM_CHAR;
518                     }
519                     else if (markFlag & RLM_BEFORE) {
520                         uc=RLM_CHAR;
521                     }
522                     else  uc=0;
523                     if(uc) {
524                         if(destSize>0) {
525                             *dest++=uc;
526                         }
527                         --destSize;
528                     }
529 
530                     runLength=doWriteReverse(src, runLength,
531                                              dest, destSize,
532                                              options, pErrorCode);
533                     dest+=runLength;
534                     destSize-=runLength;
535 
536                     if((pBiDi->isInverse) &&
537                        (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
538                         markFlag |= RLM_AFTER;
539                     }
540                     if (markFlag & LRM_AFTER) {
541                         uc=LRM_CHAR;
542                     }
543                     else if (markFlag & RLM_AFTER) {
544                         uc=RLM_CHAR;
545                     }
546                     else  uc=0;
547                     if(uc) {
548                         if(destSize>0) {
549                             *dest++=uc;
550                         }
551                         --destSize;
552                     }
553                 }
554             }
555         }
556     } else {
557         /* reverse output */
558         if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
559             /* do not insert BiDi controls */
560             for(run=runCount; --run>=0;) {
561                 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
562                     runLength=doWriteReverse(text+logicalStart, runLength,
563                                              dest, destSize,
564                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
565                 } else {
566                     runLength=doWriteForward(text+logicalStart, runLength,
567                                              dest, destSize,
568                                              options, pErrorCode);
569                 }
570                 dest+=runLength;
571                 destSize-=runLength;
572             }
573         } else {
574             /* insert BiDi controls for "inverse BiDi" */
575             const DirProp *dirProps=pBiDi->dirProps;
576             const UChar *src;
577             UBiDiDirection dir;
578 
579             for(run=runCount; --run>=0;) {
580                 /* reverse output */
581                 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
582                 src=text+logicalStart;
583 
584                 if(UBIDI_LTR==dir) {
585                     if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
586                         if(destSize>0) {
587                             *dest++=LRM_CHAR;
588                         }
589                         --destSize;
590                     }
591 
592                     runLength=doWriteReverse(src, runLength,
593                                              dest, destSize,
594                                              (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
595                     dest+=runLength;
596                     destSize-=runLength;
597 
598                     if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
599                         if(destSize>0) {
600                             *dest++=LRM_CHAR;
601                         }
602                         --destSize;
603                     }
604                 } else {
605                     if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
606                         if(destSize>0) {
607                             *dest++=RLM_CHAR;
608                         }
609                         --destSize;
610                     }
611 
612                     runLength=doWriteForward(src, runLength,
613                                              dest, destSize,
614                                              options, pErrorCode);
615                     dest+=runLength;
616                     destSize-=runLength;
617 
618                     if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
619                         if(destSize>0) {
620                             *dest++=RLM_CHAR;
621                         }
622                         --destSize;
623                     }
624                 }
625             }
626         }
627     }
628 
629     return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
630 }
631