1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2000-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: ubidiwrt.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999aug06
16 * created by: Markus W. Scherer, updated by Matitiahu Allouche
17 *
18 * This file contains implementations for BiDi functions that use
19 * the core algorithm and core API to write reordered text.
20 */
21
22 #include "unicode/utypes.h"
23 #include "unicode/ustring.h"
24 #include "unicode/uchar.h"
25 #include "unicode/ubidi.h"
26 #include "unicode/utf16.h"
27 #include "cmemory.h"
28 #include "ustr_imp.h"
29 #include "ubidiimp.h"
30
/*
 * The function implementations in this file are designed
 * for UTF-16 and UTF-32, not for UTF-8.
 *
 * Assumptions that are not true for UTF-8:
 * - Any code point always needs the same number of code units
 *   ("minimum-length-problem" of UTF-8)
 * - The BiDi control characters need only one code unit each
 *
 * Further assumptions for all UTFs:
 * - u_charMirror(c) needs the same number of code units as c
 */
43 #if defined(UTF_SIZE) && UTF_SIZE==8
44 # error reimplement ubidi_writeReordered() for UTF-8, see comment above
45 #endif
46
/*
 * Nonzero if `type` (a general category value such as u_charType() returns)
 * is a non-spacing, combining-spacing or enclosing mark.
 */
#define IS_COMBINING(type) \
    ((1UL<<(type)) & \
     ((1UL<<U_NON_SPACING_MARK) | \
      (1UL<<U_COMBINING_SPACING_MARK) | \
      (1UL<<U_ENCLOSING_MARK)))
48
/*
 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
 * semantically write RTL runs in reverse and later reverse them again.
 * Instead, we actually write them in forward order to begin with.
 * However, if the RTL run was to be mirrored, we need to mirror here now
 * since the implicit second reversal must not do it.
 * It looks strange to do mirroring in LTR output, but it is only because
 * we are writing RTL output in reverse.
 */
58 static int32_t
doWriteForward(const char16_t * src,int32_t srcLength,char16_t * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)59 doWriteForward(const char16_t *src, int32_t srcLength,
60 char16_t *dest, int32_t destSize,
61 uint16_t options,
62 UErrorCode *pErrorCode) {
63 /* optimize for several combinations of options */
64 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
65 case 0: {
66 /* simply copy the LTR run to the destination */
67 int32_t length=srcLength;
68 if(destSize<length) {
69 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
70 return srcLength;
71 }
72 do {
73 *dest++=*src++;
74 } while(--length>0);
75 return srcLength;
76 }
77 case UBIDI_DO_MIRRORING: {
78 /* do mirroring */
79 int32_t i=0, j=0;
80 UChar32 c;
81
82 if(destSize<srcLength) {
83 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
84 return srcLength;
85 }
86 do {
87 U16_NEXT(src, i, srcLength, c);
88 c=u_charMirror(c);
89 U16_APPEND_UNSAFE(dest, j, c);
90 } while(i<srcLength);
91 return srcLength;
92 }
93 case UBIDI_REMOVE_BIDI_CONTROLS: {
94 /* copy the LTR run and remove any BiDi control characters */
95 int32_t remaining=destSize;
96 char16_t c;
97 do {
98 c=*src++;
99 if(!IS_BIDI_CONTROL_CHAR(c)) {
100 if(--remaining<0) {
101 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
102
103 /* preflight the length */
104 while(--srcLength>0) {
105 c=*src++;
106 if(!IS_BIDI_CONTROL_CHAR(c)) {
107 --remaining;
108 }
109 }
110 return destSize-remaining;
111 }
112 *dest++=c;
113 }
114 } while(--srcLength>0);
115 return destSize-remaining;
116 }
117 default: {
118 /* remove BiDi control characters and do mirroring */
119 int32_t remaining=destSize;
120 int32_t i, j=0;
121 UChar32 c;
122 do {
123 i=0;
124 U16_NEXT(src, i, srcLength, c);
125 src+=i;
126 srcLength-=i;
127 if(!IS_BIDI_CONTROL_CHAR(c)) {
128 remaining-=i;
129 if(remaining<0) {
130 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
131
132 /* preflight the length */
133 while(srcLength>0) {
134 c=*src++;
135 if(!IS_BIDI_CONTROL_CHAR(c)) {
136 --remaining;
137 }
138 --srcLength;
139 }
140 return destSize-remaining;
141 }
142 c=u_charMirror(c);
143 U16_APPEND_UNSAFE(dest, j, c);
144 }
145 } while(srcLength>0);
146 return j;
147 }
148 } /* end of switch */
149 }
150
151 static int32_t
doWriteReverse(const char16_t * src,int32_t srcLength,char16_t * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)152 doWriteReverse(const char16_t *src, int32_t srcLength,
153 char16_t *dest, int32_t destSize,
154 uint16_t options,
155 UErrorCode *pErrorCode) {
156 /*
157 * RTL run -
158 *
159 * RTL runs need to be copied to the destination in reverse order
160 * of code points, not code units, to keep Unicode characters intact.
161 *
162 * The general strategy for this is to read the source text
163 * in backward order, collect all code units for a code point
164 * (and optionally following combining characters, see below),
165 * and copy all these code units in ascending order
166 * to the destination for this run.
167 *
168 * Several options request whether combining characters
169 * should be kept after their base characters,
170 * whether BiDi control characters should be removed, and
171 * whether characters should be replaced by their mirror-image
172 * equivalent Unicode characters.
173 */
174 int32_t i, j;
175 UChar32 c;
176
177 /* optimize for several combinations of options */
178 switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
179 case 0:
180 /*
181 * With none of the "complicated" options set, the destination
182 * run will have the same length as the source run,
183 * and there is no mirroring and no keeping combining characters
184 * with their base characters.
185 */
186 if(destSize<srcLength) {
187 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
188 return srcLength;
189 }
190 destSize=srcLength;
191
192 /* preserve character integrity */
193 do {
194 /* i is always after the last code unit known to need to be kept in this segment */
195 i=srcLength;
196
197 /* collect code units for one base character */
198 U16_BACK_1(src, 0, srcLength);
199
200 /* copy this base character */
201 j=srcLength;
202 do {
203 *dest++=src[j++];
204 } while(j<i);
205 } while(srcLength>0);
206 break;
207 case UBIDI_KEEP_BASE_COMBINING:
208 /*
209 * Here, too, the destination
210 * run will have the same length as the source run,
211 * and there is no mirroring.
212 * We do need to keep combining characters with their base characters.
213 */
214 if(destSize<srcLength) {
215 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
216 return srcLength;
217 }
218 destSize=srcLength;
219
220 /* preserve character integrity */
221 do {
222 /* i is always after the last code unit known to need to be kept in this segment */
223 i=srcLength;
224
225 /* collect code units and modifier letters for one base character */
226 do {
227 U16_PREV(src, 0, srcLength, c);
228 } while(srcLength>0 && IS_COMBINING(u_charType(c)));
229
230 /* copy this "user character" */
231 j=srcLength;
232 do {
233 *dest++=src[j++];
234 } while(j<i);
235 } while(srcLength>0);
236 break;
237 default:
238 /*
239 * With several "complicated" options set, this is the most
240 * general and the slowest copying of an RTL run.
241 * We will do mirroring, remove BiDi controls, and
242 * keep combining characters with their base characters
243 * as requested.
244 */
245 if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
246 i=srcLength;
247 } else {
248 /* we need to find out the destination length of the run,
249 which will not include the BiDi control characters */
250 int32_t length=srcLength;
251 char16_t ch;
252
253 i=0;
254 do {
255 ch=*src++;
256 if(!IS_BIDI_CONTROL_CHAR(ch)) {
257 ++i;
258 }
259 } while(--length>0);
260 src-=srcLength;
261 }
262
263 if(destSize<i) {
264 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265 return i;
266 }
267 destSize=i;
268
269 /* preserve character integrity */
270 do {
271 /* i is always after the last code unit known to need to be kept in this segment */
272 i=srcLength;
273
274 /* collect code units for one base character */
275 U16_PREV(src, 0, srcLength, c);
276 if(options&UBIDI_KEEP_BASE_COMBINING) {
277 /* collect modifier letters for this base character */
278 while(srcLength>0 && IS_COMBINING(u_charType(c))) {
279 U16_PREV(src, 0, srcLength, c);
280 }
281 }
282
283 if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
284 /* do not copy this BiDi control character */
285 continue;
286 }
287
288 /* copy this "user character" */
289 j=srcLength;
290 if(options&UBIDI_DO_MIRRORING) {
291 /* mirror only the base character */
292 int32_t k=0;
293 c=u_charMirror(c);
294 U16_APPEND_UNSAFE(dest, k, c);
295 dest+=k;
296 j+=k;
297 }
298 while(j<i) {
299 *dest++=src[j++];
300 }
301 } while(srcLength>0);
302 break;
303 } /* end of switch */
304
305 return destSize;
306 }
307
308 U_CAPI int32_t U_EXPORT2
ubidi_writeReverse(const char16_t * src,int32_t srcLength,char16_t * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)309 ubidi_writeReverse(const char16_t *src, int32_t srcLength,
310 char16_t *dest, int32_t destSize,
311 uint16_t options,
312 UErrorCode *pErrorCode) {
313 int32_t destLength;
314
315 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
316 return 0;
317 }
318
319 /* more error checking */
320 if( src==nullptr || srcLength<-1 ||
321 destSize<0 || (destSize>0 && dest==nullptr))
322 {
323 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
324 return 0;
325 }
326
327 /* do input and output overlap? */
328 if( dest!=nullptr &&
329 ((src>=dest && src<dest+destSize) ||
330 (dest>=src && dest<src+srcLength)))
331 {
332 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
333 return 0;
334 }
335
336 if(srcLength==-1) {
337 srcLength=u_strlen(src);
338 }
339 if(srcLength>0) {
340 destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
341 } else {
342 /* nothing to do */
343 destLength=0;
344 }
345
346 return u_terminateUChars(dest, destSize, destLength, pErrorCode);
347 }
348
// Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
// function on Windows ARM64. As a work-around, we disable optimizations for this function.
// This work-around could/should be removed once the following versions of Visual Studio are no
// longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
353 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
354 #pragma optimize( "", off )
355 #endif
356 U_CAPI int32_t U_EXPORT2
ubidi_writeReordered(UBiDi * pBiDi,char16_t * dest,int32_t destSize,uint16_t options,UErrorCode * pErrorCode)357 ubidi_writeReordered(UBiDi *pBiDi,
358 char16_t *dest, int32_t destSize,
359 uint16_t options,
360 UErrorCode *pErrorCode) {
361 const char16_t *text;
362 char16_t *saveDest;
363 int32_t length, destCapacity;
364 int32_t run, runCount, logicalStart, runLength;
365
366 if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
367 return 0;
368 }
369
370 /* more error checking */
371 if( pBiDi==nullptr ||
372 (text=pBiDi->text)==nullptr || (length=pBiDi->length)<0 ||
373 destSize<0 || (destSize>0 && dest==nullptr))
374 {
375 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
376 return 0;
377 }
378
379 /* do input and output overlap? */
380 if( dest!=nullptr &&
381 ((text>=dest && text<dest+destSize) ||
382 (dest>=text && dest<text+pBiDi->originalLength)))
383 {
384 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
385 return 0;
386 }
387
388 if(length==0) {
389 /* nothing to do */
390 return u_terminateUChars(dest, destSize, 0, pErrorCode);
391 }
392
393 runCount=ubidi_countRuns(pBiDi, pErrorCode);
394 if(U_FAILURE(*pErrorCode)) {
395 return 0;
396 }
397
398 /* destSize shrinks, later destination length=destCapacity-destSize */
399 saveDest=dest;
400 destCapacity=destSize;
401
402 /*
403 * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
404 * reordering mode (checked below) is appropriate.
405 */
406 if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
407 options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
408 options&=~UBIDI_REMOVE_BIDI_CONTROLS;
409 }
410 /*
411 * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
412 * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
413 */
414 if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
415 options|=UBIDI_REMOVE_BIDI_CONTROLS;
416 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
417 }
418 /*
419 * If we do not perform the "inverse BiDi" algorithm, then we
420 * don't need to insert any LRMs, and don't need to test for it.
421 */
422 if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
423 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT) &&
424 (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
425 (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
426 options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
427 }
428 /*
429 * Iterate through all visual runs and copy the run text segments to
430 * the destination, according to the options.
431 *
432 * The tests for where to insert LRMs ignore the fact that there may be
433 * BN codes or non-BMP code points at the beginning and end of a run;
434 * they may insert LRMs unnecessarily but the tests are faster this way
435 * (this would have to be improved for UTF-8).
436 *
437 * Note that the only errors that are set by doWriteXY() are buffer overflow
438 * errors. Ignore them until the end, and continue for preflighting.
439 */
440 if(!(options&UBIDI_OUTPUT_REVERSE)) {
441 /* forward output */
442 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
443 /* do not insert BiDi controls */
444 for(run=0; run<runCount; ++run) {
445 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
446 runLength=doWriteForward(text+logicalStart, runLength,
447 dest, destSize,
448 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
449 } else {
450 runLength=doWriteReverse(text+logicalStart, runLength,
451 dest, destSize,
452 options, pErrorCode);
453 }
454 if(dest!=nullptr) {
455 dest+=runLength;
456 }
457 destSize-=runLength;
458 }
459 } else {
460 /* insert BiDi controls for "inverse BiDi" */
461 const DirProp *dirProps=pBiDi->dirProps;
462 const char16_t *src;
463 char16_t uc;
464 UBiDiDirection dir;
465 int32_t markFlag;
466
467 for(run=0; run<runCount; ++run) {
468 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
469 src=text+logicalStart;
470 /* check if something relevant in insertPoints */
471 markFlag=pBiDi->runs[run].insertRemove;
472 if(markFlag<0) { /* BiDi controls count */
473 markFlag=0;
474 }
475
476 if(UBIDI_LTR==dir) {
477 if((pBiDi->isInverse) &&
478 (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
479 markFlag |= LRM_BEFORE;
480 }
481 if (markFlag & LRM_BEFORE) {
482 uc=LRM_CHAR;
483 }
484 else if (markFlag & RLM_BEFORE) {
485 uc=RLM_CHAR;
486 }
487 else uc=0;
488 if(uc) {
489 if(destSize>0) {
490 *dest++=uc;
491 }
492 --destSize;
493 }
494
495 runLength=doWriteForward(src, runLength,
496 dest, destSize,
497 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
498 if(dest!=nullptr) {
499 dest+=runLength;
500 }
501 destSize-=runLength;
502
503 if((pBiDi->isInverse) &&
504 (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
505 markFlag |= LRM_AFTER;
506 }
507 if (markFlag & LRM_AFTER) {
508 uc=LRM_CHAR;
509 }
510 else if (markFlag & RLM_AFTER) {
511 uc=RLM_CHAR;
512 }
513 else uc=0;
514 if(uc) {
515 if(destSize>0) {
516 *dest++=uc;
517 }
518 --destSize;
519 }
520 } else { /* RTL run */
521 if((pBiDi->isInverse) &&
522 (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
523 markFlag |= RLM_BEFORE;
524 }
525 if (markFlag & LRM_BEFORE) {
526 uc=LRM_CHAR;
527 }
528 else if (markFlag & RLM_BEFORE) {
529 uc=RLM_CHAR;
530 }
531 else uc=0;
532 if(uc) {
533 if(destSize>0) {
534 *dest++=uc;
535 }
536 --destSize;
537 }
538
539 runLength=doWriteReverse(src, runLength,
540 dest, destSize,
541 options, pErrorCode);
542 if(dest!=nullptr) {
543 dest+=runLength;
544 }
545 destSize-=runLength;
546
547 if((pBiDi->isInverse) &&
548 (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
549 markFlag |= RLM_AFTER;
550 }
551 if (markFlag & LRM_AFTER) {
552 uc=LRM_CHAR;
553 }
554 else if (markFlag & RLM_AFTER) {
555 uc=RLM_CHAR;
556 }
557 else uc=0;
558 if(uc) {
559 if(destSize>0) {
560 *dest++=uc;
561 }
562 --destSize;
563 }
564 }
565 }
566 }
567 } else {
568 /* reverse output */
569 if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
570 /* do not insert BiDi controls */
571 for(run=runCount; --run>=0;) {
572 if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
573 runLength=doWriteReverse(text+logicalStart, runLength,
574 dest, destSize,
575 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
576 } else {
577 runLength=doWriteForward(text+logicalStart, runLength,
578 dest, destSize,
579 options, pErrorCode);
580 }
581 if(dest!=nullptr) {
582 dest+=runLength;
583 }
584 destSize-=runLength;
585 }
586 } else {
587 /* insert BiDi controls for "inverse BiDi" */
588 const DirProp *dirProps=pBiDi->dirProps;
589 const char16_t *src;
590 UBiDiDirection dir;
591
592 for(run=runCount; --run>=0;) {
593 /* reverse output */
594 dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
595 src=text+logicalStart;
596
597 if(UBIDI_LTR==dir) {
598 if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
599 if(destSize>0) {
600 *dest++=LRM_CHAR;
601 }
602 --destSize;
603 }
604
605 runLength=doWriteReverse(src, runLength,
606 dest, destSize,
607 (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
608 if(dest!=nullptr) {
609 dest+=runLength;
610 }
611 destSize-=runLength;
612
613 if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
614 if(destSize>0) {
615 *dest++=LRM_CHAR;
616 }
617 --destSize;
618 }
619 } else {
620 if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
621 if(destSize>0) {
622 *dest++=RLM_CHAR;
623 }
624 --destSize;
625 }
626
627 runLength=doWriteForward(src, runLength,
628 dest, destSize,
629 options, pErrorCode);
630 if(dest!=nullptr) {
631 dest+=runLength;
632 }
633 destSize-=runLength;
634
635 if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
636 if(destSize>0) {
637 *dest++=RLM_CHAR;
638 }
639 --destSize;
640 }
641 }
642 }
643 }
644 }
645
646 return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
647 }
648 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
649 #pragma optimize( "", on )
650 #endif
651