1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 1999-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: ubidiimp.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 1999aug06 16 * created by: Markus W. Scherer, updated by Matitiahu Allouche 17 */ 18 19 #ifndef UBIDIIMP_H 20 #define UBIDIIMP_H 21 22 #include "unicode/utypes.h" 23 #include "unicode/ubidi.h" 24 #include "unicode/uchar.h" 25 #include "ubidi_props.h" 26 27 /* miscellaneous definitions ---------------------------------------------- */ 28 29 // ICU-20853=ICU-20935 Solaris #defines CS and ES in sys/regset.h 30 #ifdef CS 31 # undef CS 32 #endif 33 #ifdef ES 34 # undef ES 35 #endif 36 37 typedef uint8_t DirProp; 38 typedef uint32_t Flags; 39 40 /* Comparing the description of the BiDi algorithm with this implementation 41 is easier with the same names for the BiDi types in the code as there. 42 See UCharDirection in uchar.h . 43 */ 44 enum { 45 L= U_LEFT_TO_RIGHT, /* 0 */ 46 R= U_RIGHT_TO_LEFT, /* 1 */ 47 EN= U_EUROPEAN_NUMBER, /* 2 */ 48 ES= U_EUROPEAN_NUMBER_SEPARATOR, /* 3 */ 49 ET= U_EUROPEAN_NUMBER_TERMINATOR, /* 4 */ 50 AN= U_ARABIC_NUMBER, /* 5 */ 51 CS= U_COMMON_NUMBER_SEPARATOR, /* 6 */ 52 B= U_BLOCK_SEPARATOR, /* 7 */ 53 S= U_SEGMENT_SEPARATOR, /* 8 */ 54 WS= U_WHITE_SPACE_NEUTRAL, /* 9 */ 55 ON= U_OTHER_NEUTRAL, /* 10 */ 56 LRE=U_LEFT_TO_RIGHT_EMBEDDING, /* 11 */ 57 LRO=U_LEFT_TO_RIGHT_OVERRIDE, /* 12 */ 58 AL= U_RIGHT_TO_LEFT_ARABIC, /* 13 */ 59 RLE=U_RIGHT_TO_LEFT_EMBEDDING, /* 14 */ 60 RLO=U_RIGHT_TO_LEFT_OVERRIDE, /* 15 */ 61 PDF=U_POP_DIRECTIONAL_FORMAT, /* 16 */ 62 NSM=U_DIR_NON_SPACING_MARK, /* 17 */ 63 BN= U_BOUNDARY_NEUTRAL, /* 18 */ 64 FSI=U_FIRST_STRONG_ISOLATE, /* 19 */ 65 LRI=U_LEFT_TO_RIGHT_ISOLATE, /* 20 */ 66 RLI=U_RIGHT_TO_LEFT_ISOLATE, /* 21 */ 67 PDI=U_POP_DIRECTIONAL_ISOLATE, /* 22 */ 68 ENL, /* EN after W7 */ /* 23 */ 69 ENR, /* EN not subject to W7 */ /* 24 */ 70 dirPropCount 71 }; 72 73 /* Sometimes, bit values are more appropriate 74 to deal with directionality properties. 75 Abbreviations in these macro names refer to names 76 used in the BiDi algorithm. 77 */ 78 #define DIRPROP_FLAG(dir) (1UL<<(dir)) 79 #define PURE_DIRPROP(prop) ((prop)&~0xE0) ????????????????????????? 80 81 /* special flag for multiple runs from explicit embedding codes */ 82 #define DIRPROP_FLAG_MULTI_RUNS (1UL<<31) 83 84 /* are there any characters that are LTR or RTL? */ 85 #define MASK_LTR (DIRPROP_FLAG(L)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(ENL)|DIRPROP_FLAG(ENR)|DIRPROP_FLAG(AN)|DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(LRI)) 86 #define MASK_RTL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(RLI)) 87 #define MASK_R_AL (DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)) 88 #define MASK_STRONG_EN_AN (DIRPROP_FLAG(L)|DIRPROP_FLAG(R)|DIRPROP_FLAG(AL)|DIRPROP_FLAG(EN)|DIRPROP_FLAG(AN)) 89 90 /* explicit embedding codes */ 91 #define MASK_EXPLICIT (DIRPROP_FLAG(LRE)|DIRPROP_FLAG(LRO)|DIRPROP_FLAG(RLE)|DIRPROP_FLAG(RLO)|DIRPROP_FLAG(PDF)) 92 93 /* explicit isolate codes */ 94 #define MASK_ISO (DIRPROP_FLAG(LRI)|DIRPROP_FLAG(RLI)|DIRPROP_FLAG(FSI)|DIRPROP_FLAG(PDI)) 95 96 #define MASK_BN_EXPLICIT (DIRPROP_FLAG(BN)|MASK_EXPLICIT) 97 98 /* paragraph and segment separators */ 99 #define MASK_B_S (DIRPROP_FLAG(B)|DIRPROP_FLAG(S)) 100 101 /* all types that are counted as White Space or Neutral in some steps */ 102 #define MASK_WS (MASK_B_S|DIRPROP_FLAG(WS)|MASK_BN_EXPLICIT|MASK_ISO) 103 104 /* types that are neutrals or could becomes neutrals in (Wn) */ 105 #define MASK_POSSIBLE_N (DIRPROP_FLAG(ON)|DIRPROP_FLAG(CS)|DIRPROP_FLAG(ES)|DIRPROP_FLAG(ET)|MASK_WS) 106 107 /* 108 * These types may be changed to "e", 109 * the embedding type (L or R) of the run, 110 * in the BiDi algorithm (N2) 111 */ 112 #define MASK_EMBEDDING (DIRPROP_FLAG(NSM)|MASK_POSSIBLE_N) 113 114 /* the dirProp's L and R are defined to 0 and 1 values in UCharDirection */ 115 #define GET_LR_FROM_LEVEL(level) ((DirProp)((level)&1)) 116 117 #define IS_DEFAULT_LEVEL(level) ((level)>=0xfe) 118 119 /* 120 * The following bit is used for the directional isolate status. 121 * Stack entries corresponding to isolate sequences are greater than ISOLATE. 122 */ 123 #define ISOLATE 0x0100 124 125 U_CFUNC UBiDiLevel 126 ubidi_getParaLevelAtIndex(const UBiDi *pBiDi, int32_t index); 127 128 #define GET_PARALEVEL(ubidi, index) \ 129 ((UBiDiLevel)(!(ubidi)->defaultParaLevel || (index)<(ubidi)->paras[0].limit ? \ 130 (ubidi)->paraLevel : ubidi_getParaLevelAtIndex((ubidi), (index)))) 131 132 /* number of paras entries allocated initially without malloc */ 133 #define SIMPLE_PARAS_COUNT 10 134 /* number of isolate entries allocated initially without malloc */ 135 #define SIMPLE_ISOLATES_COUNT 5 136 /* number of isolate run entries for paired brackets allocated initially without malloc */ 137 #define SIMPLE_OPENINGS_COUNT 20 138 139 #define CR 0x000D 140 #define LF 0x000A 141 142 /* Run structure for reordering --------------------------------------------- */ 143 enum { 144 LRM_BEFORE=1, 145 LRM_AFTER=2, 146 RLM_BEFORE=4, 147 RLM_AFTER=8 148 }; 149 150 typedef struct Para { 151 int32_t limit; 152 int32_t level; 153 } Para; 154 155 enum { /* flags for Opening.flags */ 156 FOUND_L=DIRPROP_FLAG(L), 157 FOUND_R=DIRPROP_FLAG(R) 158 }; 159 160 typedef struct Opening { 161 int32_t position; /* position of opening bracket */ 162 int32_t match; /* matching char or -position of closing bracket */ 163 int32_t contextPos; /* position of last strong char found before opening */ 164 uint16_t flags; /* bits for L or R/AL found within the pair */ 165 UBiDiDirection contextDir; /* L or R according to last strong char before opening */ 166 uint8_t filler; /* to complete a nice multiple of 4 chars */ 167 } Opening; 168 169 typedef struct IsoRun { 170 int32_t contextPos; /* position of char determining context */ 171 uint16_t start; /* index of first opening entry for this run */ 172 uint16_t limit; /* index after last opening entry for this run */ 173 UBiDiLevel level; /* level of this run */ 174 DirProp lastStrong; /* bidi class of last strong char found in this run */ 175 DirProp lastBase; /* bidi class of last base char found in this run */ 176 UBiDiDirection contextDir; /* L or R to use as context for following openings */ 177 } IsoRun; 178 179 typedef struct BracketData { 180 UBiDi *pBiDi; 181 /* array of opening entries which should be enough in most cases; no malloc() */ 182 Opening simpleOpenings[SIMPLE_OPENINGS_COUNT]; 183 Opening *openings; /* pointer to current array of entries */ 184 int32_t openingsCount; /* number of allocated entries */ 185 int32_t isoRunLast; /* index of last used entry */ 186 /* array of nested isolated sequence entries; can never excess UBIDI_MAX_EXPLICIT_LEVEL 187 + 1 for index 0, + 1 for before the first isolated sequence */ 188 IsoRun isoRuns[UBIDI_MAX_EXPLICIT_LEVEL+2]; 189 UBool isNumbersSpecial; /* reordering mode for NUMBERS_SPECIAL */ 190 } BracketData; 191 192 typedef struct Isolate { 193 int32_t startON; 194 int32_t start1; 195 int32_t state; 196 int16_t stateImp; 197 } Isolate; 198 199 typedef struct Run { 200 int32_t logicalStart, /* first character of the run; b31 indicates even/odd level */ 201 visualLimit, /* last visual position of the run +1 */ 202 insertRemove; /* if >0, flags for inserting LRM/RLM before/after run, 203 if <0, count of bidi controls within run */ 204 } Run; 205 206 /* in a Run, logicalStart will get this bit set if the run level is odd */ 207 #define INDEX_ODD_BIT (1UL<<31) 208 209 #define MAKE_INDEX_ODD_PAIR(index, level) ((index)|((int32_t)((level)&1)<<31)) 210 #define ADD_ODD_BIT_FROM_LEVEL(x, level) ((x)|=((int32_t)((level)&1)<<31)) 211 #define REMOVE_ODD_BIT(x) ((x)&=~INDEX_ODD_BIT) 212 213 #define GET_INDEX(x) ((x)&~INDEX_ODD_BIT) 214 #define GET_ODD_BIT(x) ((uint32_t)(x)>>31) 215 #define IS_ODD_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)!=0)) 216 #define IS_EVEN_RUN(x) ((UBool)(((x)&INDEX_ODD_BIT)==0)) 217 218 U_CFUNC UBool 219 ubidi_getRuns(UBiDi *pBiDi, UErrorCode *pErrorCode); 220 221 /** BiDi control code points */ 222 enum { 223 ZWNJ_CHAR=0x200c, 224 ZWJ_CHAR, 225 LRM_CHAR, 226 RLM_CHAR, 227 LRE_CHAR=0x202a, 228 RLE_CHAR, 229 PDF_CHAR, 230 LRO_CHAR, 231 RLO_CHAR, 232 LRI_CHAR=0x2066, 233 RLI_CHAR, 234 FSI_CHAR, 235 PDI_CHAR 236 }; 237 238 #define IS_BIDI_CONTROL_CHAR(c) (((uint32_t)(c)&0xfffffffc)==ZWNJ_CHAR || (uint32_t)((c)-LRE_CHAR)<5 || (uint32_t)((c)-LRI_CHAR)<4) 239 240 /* InsertPoints structure for noting where to put BiDi marks ---------------- */ 241 242 typedef struct Point { 243 int32_t pos; /* position in text */ 244 int32_t flag; /* flag for LRM/RLM, before/after */ 245 } Point; 246 247 typedef struct InsertPoints { 248 int32_t capacity; /* number of points allocated */ 249 int32_t size; /* number of points used */ 250 int32_t confirmed; /* number of points confirmed */ 251 UErrorCode errorCode; /* for eventual memory shortage */ 252 Point *points; /* pointer to array of points */ 253 } InsertPoints; 254 255 256 /* UBiDi structure ----------------------------------------------------------- */ 257 258 struct UBiDi { 259 /* pointer to parent paragraph object (pointer to self if this object is 260 * a paragraph object); set to NULL in a newly opened object; set to a 261 * real value after a successful execution of ubidi_setPara or ubidi_setLine 262 */ 263 const UBiDi * pParaBiDi; 264 265 /* alias pointer to the current text */ 266 const UChar *text; 267 268 /* length of the current text */ 269 int32_t originalLength; 270 271 /* if the UBIDI_OPTION_STREAMING option is set, this is the length 272 * of text actually processed by ubidi_setPara, which may be shorter than 273 * the original length. 274 * Otherwise, it is identical to the original length. 275 */ 276 int32_t length; 277 278 /* if the UBIDI_OPTION_REMOVE_CONTROLS option is set, and/or 279 * marks are allowed to be inserted in one of the reordering mode, the 280 * length of the result string may be different from the processed length. 281 */ 282 int32_t resultLength; 283 284 /* memory sizes in bytes */ 285 int32_t dirPropsSize, levelsSize, openingsSize, parasSize, runsSize, isolatesSize; 286 287 /* allocated memory */ 288 DirProp *dirPropsMemory; 289 UBiDiLevel *levelsMemory; 290 Opening *openingsMemory; 291 Para *parasMemory; 292 Run *runsMemory; 293 Isolate *isolatesMemory; 294 295 /* indicators for whether memory may be allocated after ubidi_open() */ 296 UBool mayAllocateText, mayAllocateRuns; 297 298 /* arrays with one value per text-character */ 299 DirProp *dirProps; 300 UBiDiLevel *levels; 301 302 /* are we performing an approximation of the "inverse BiDi" algorithm? */ 303 UBool isInverse; 304 305 /* are we using the basic algorithm or its variation? */ 306 UBiDiReorderingMode reorderingMode; 307 308 /* UBIDI_REORDER_xxx values must be ordered so that all the regular 309 * logical to visual modes come first, and all inverse BiDi modes 310 * come last. 311 */ 312 #define UBIDI_REORDER_LAST_LOGICAL_TO_VISUAL UBIDI_REORDER_NUMBERS_SPECIAL 313 314 /* bitmask for reordering options */ 315 uint32_t reorderingOptions; 316 317 /* must block separators receive level 0? */ 318 UBool orderParagraphsLTR; 319 320 /* the paragraph level */ 321 UBiDiLevel paraLevel; 322 /* original paraLevel when contextual */ 323 /* must be one of UBIDI_DEFAULT_xxx or 0 if not contextual */ 324 UBiDiLevel defaultParaLevel; 325 326 /* context data */ 327 const UChar *prologue; 328 int32_t proLength; 329 const UChar *epilogue; 330 int32_t epiLength; 331 332 /* the following is set in ubidi_setPara, used in processPropertySeq */ 333 const struct ImpTabPair * pImpTabPair; /* pointer to levels state table pair */ 334 335 /* the overall paragraph or line directionality - see UBiDiDirection */ 336 UBiDiDirection direction; 337 338 /* flags is a bit set for which directional properties are in the text */ 339 Flags flags; 340 341 /* lastArabicPos is index to the last AL in the text, -1 if none */ 342 int32_t lastArabicPos; 343 344 /* characters after trailingWSStart are WS and are */ 345 /* implicitly at the paraLevel (rule (L1)) - levels may not reflect that */ 346 int32_t trailingWSStart; 347 348 /* fields for paragraph handling */ 349 int32_t paraCount; /* set in getDirProps() */ 350 /* filled in getDirProps() */ 351 Para *paras; 352 353 /* for relatively short text, we only need a tiny array of paras (no malloc()) */ 354 Para simpleParas[SIMPLE_PARAS_COUNT]; 355 356 /* fields for line reordering */ 357 int32_t runCount; /* ==-1: runs not set up yet */ 358 Run *runs; 359 360 /* for non-mixed text, we only need a tiny array of runs (no malloc()) */ 361 Run simpleRuns[1]; 362 363 /* maximum or current nesting depth of isolate sequences */ 364 /* Within resolveExplicitLevels() and checkExplicitLevels(), this is the maximal 365 nesting encountered. 366 Within resolveImplicitLevels(), this is the index of the current isolates 367 stack entry. */ 368 int32_t isolateCount; 369 Isolate *isolates; 370 371 /* for simple text, have a small stack (no malloc()) */ 372 Isolate simpleIsolates[SIMPLE_ISOLATES_COUNT]; 373 374 /* for inverse Bidi with insertion of directional marks */ 375 InsertPoints insertPoints; 376 377 /* for option UBIDI_OPTION_REMOVE_CONTROLS */ 378 int32_t controlCount; 379 380 /* for Bidi class callback */ 381 UBiDiClassCallback *fnClassCallback; /* action pointer */ 382 const void *coClassCallback; /* context pointer */ 383 }; 384 385 #define IS_VALID_PARA(x) ((x) && ((x)->pParaBiDi==(x))) 386 #define IS_VALID_PARA_OR_LINE(x) ((x) && ((x)->pParaBiDi==(x) || (((x)->pParaBiDi) && (x)->pParaBiDi->pParaBiDi==(x)->pParaBiDi))) 387 388 typedef union { 389 DirProp *dirPropsMemory; 390 UBiDiLevel *levelsMemory; 391 Opening *openingsMemory; 392 Para *parasMemory; 393 Run *runsMemory; 394 Isolate *isolatesMemory; 395 } BidiMemoryForAllocation; 396 397 /* Macros for initial checks at function entry */ 398 #define RETURN_IF_NULL_OR_FAILING_ERRCODE(pErrcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \ 399 if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return retvalue; \ 400 } UPRV_BLOCK_MACRO_END 401 #define RETURN_IF_NOT_VALID_PARA(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \ 402 if(!IS_VALID_PARA(bidi)) { \ 403 errcode=U_INVALID_STATE_ERROR; \ 404 return retvalue; \ 405 } \ 406 } UPRV_BLOCK_MACRO_END 407 #define RETURN_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \ 408 if(!IS_VALID_PARA_OR_LINE(bidi)) { \ 409 errcode=U_INVALID_STATE_ERROR; \ 410 return retvalue; \ 411 } \ 412 } UPRV_BLOCK_MACRO_END 413 #define RETURN_IF_BAD_RANGE(arg, start, limit, errcode, retvalue) UPRV_BLOCK_MACRO_BEGIN { \ 414 if((arg)<(start) || (arg)>=(limit)) { \ 415 (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \ 416 return retvalue; \ 417 } \ 418 } UPRV_BLOCK_MACRO_END 419 420 #define RETURN_VOID_IF_NULL_OR_FAILING_ERRCODE(pErrcode) UPRV_BLOCK_MACRO_BEGIN { \ 421 if((pErrcode)==NULL || U_FAILURE(*pErrcode)) return; \ 422 } UPRV_BLOCK_MACRO_END 423 #define RETURN_VOID_IF_NOT_VALID_PARA(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \ 424 if(!IS_VALID_PARA(bidi)) { \ 425 errcode=U_INVALID_STATE_ERROR; \ 426 return; \ 427 } \ 428 } UPRV_BLOCK_MACRO_END 429 #define RETURN_VOID_IF_NOT_VALID_PARA_OR_LINE(bidi, errcode) UPRV_BLOCK_MACRO_BEGIN { \ 430 if(!IS_VALID_PARA_OR_LINE(bidi)) { \ 431 errcode=U_INVALID_STATE_ERROR; \ 432 return; \ 433 } \ 434 } UPRV_BLOCK_MACRO_END 435 #define RETURN_VOID_IF_BAD_RANGE(arg, start, limit, errcode) UPRV_BLOCK_MACRO_BEGIN { \ 436 if((arg)<(start) || (arg)>=(limit)) { \ 437 (errcode)=U_ILLEGAL_ARGUMENT_ERROR; \ 438 return; \ 439 } \ 440 } UPRV_BLOCK_MACRO_END 441 442 /* helper function to (re)allocate memory if allowed */ 443 U_CFUNC UBool 444 ubidi_getMemory(BidiMemoryForAllocation *pMemory, int32_t *pSize, UBool mayAllocate, int32_t sizeNeeded); 445 446 /* helper macros for each allocated array in UBiDi */ 447 #define getDirPropsMemory(pBiDi, length) \ 448 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \ 449 (pBiDi)->mayAllocateText, (length)) 450 451 #define getLevelsMemory(pBiDi, length) \ 452 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \ 453 (pBiDi)->mayAllocateText, (length)) 454 455 #define getRunsMemory(pBiDi, length) \ 456 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \ 457 (pBiDi)->mayAllocateRuns, (length)*sizeof(Run)) 458 459 /* additional macros used by ubidi_open() - always allow allocation */ 460 #define getInitialDirPropsMemory(pBiDi, length) \ 461 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->dirPropsMemory, &(pBiDi)->dirPropsSize, \ 462 true, (length)) 463 464 #define getInitialLevelsMemory(pBiDi, length) \ 465 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->levelsMemory, &(pBiDi)->levelsSize, \ 466 true, (length)) 467 468 #define getInitialOpeningsMemory(pBiDi, length) \ 469 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->openingsMemory, &(pBiDi)->openingsSize, \ 470 true, (length)*sizeof(Opening)) 471 472 #define getInitialParasMemory(pBiDi, length) \ 473 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->parasMemory, &(pBiDi)->parasSize, \ 474 true, (length)*sizeof(Para)) 475 476 #define getInitialRunsMemory(pBiDi, length) \ 477 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->runsMemory, &(pBiDi)->runsSize, \ 478 true, (length)*sizeof(Run)) 479 480 #define getInitialIsolatesMemory(pBiDi, length) \ 481 ubidi_getMemory((BidiMemoryForAllocation *)&(pBiDi)->isolatesMemory, &(pBiDi)->isolatesSize, \ 482 true, (length)*sizeof(Isolate)) 483 484 #endif 485