1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: bidiconf.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009oct16
14 * created by: Markus W. Scherer
15 *
16 * BiDi conformance test, using the Unicode BidiTest.txt file.
17 */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "unicode/utypes.h"
23 #include "unicode/ubidi.h"
24 #include "unicode/errorcode.h"
25 #include "unicode/localpointer.h"
26 #include "unicode/putil.h"
27 #include "unicode/unistr.h"
28 #include "intltest.h"
29 #include "uparse.h"
30
31 class BiDiConformanceTest : public IntlTest {
32 public:
BiDiConformanceTest()33 BiDiConformanceTest() :
34 directionBits(0), lineNumber(0), levelsCount(0), orderingCount(0),
35 errorCount(0) {}
36
37 void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
38
39 void TestBidiTest();
40 private:
41 char *getUnidataPath(char path[]);
42
43 UBool parseLevels(const char *start);
44 UBool parseOrdering(const char *start);
45 UBool parseInputStringFromBiDiClasses(const char *&start);
46
47 UBool checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount,
48 const char *paraLevelName);
49 UBool checkOrdering(UBiDi *ubidi, const char *paraLevelName);
50
51 void printErrorLine(const char *paraLevelName);
52
53 char line[10000];
54 UBiDiLevel levels[1000];
55 uint32_t directionBits;
56 int32_t ordering[1000];
57 int32_t lineNumber;
58 int32_t levelsCount;
59 int32_t orderingCount;
60 int32_t errorCount;
61 UnicodeString inputString;
62 };
63
createBiDiConformanceTest()64 extern IntlTest *createBiDiConformanceTest() {
65 return new BiDiConformanceTest();
66 }
67
runIndexedTest(int32_t index,UBool exec,const char * & name,char * par)68 void BiDiConformanceTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char *par) {
69 if(exec) {
70 logln("TestSuite BiDiConformanceTest: ");
71 }
72 switch (index) {
73 TESTCASE(0, TestBidiTest);
74 default:
75 name="";
76 break; // needed to end the loop
77 }
78 }
79
80 // TODO: Move to a common place (IntlTest?) to avoid duplication with UnicodeTest (ucdtest.cpp).
getUnidataPath(char path[])81 char *BiDiConformanceTest::getUnidataPath(char path[]) {
82 IcuTestErrorCode errorCode(*this, "getUnidataPath");
83 const int kUnicodeDataTxtLength=15; // strlen("UnicodeData.txt")
84
85 // Look inside ICU_DATA first.
86 strcpy(path, pathToDataDirectory());
87 strcat(path, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
88 FILE *f=fopen(path, "r");
89 if(f!=NULL) {
90 fclose(f);
91 *(strchr(path, 0)-kUnicodeDataTxtLength)=0; // Remove the basename.
92 return path;
93 }
94
95 // As a fallback, try to guess where the source data was located
96 // at the time ICU was built, and look there.
97 # ifdef U_TOPSRCDIR
98 strcpy(path, U_TOPSRCDIR U_FILE_SEP_STRING "data");
99 # else
100 strcpy(path, loadTestData(errorCode));
101 strcat(path, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
102 U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
103 U_FILE_SEP_STRING "data");
104 # endif
105 strcat(path, U_FILE_SEP_STRING);
106 strcat(path, "unidata" U_FILE_SEP_STRING "UnicodeData.txt");
107 f=fopen(path, "r");
108 if(f!=NULL) {
109 fclose(f);
110 *(strchr(path, 0)-kUnicodeDataTxtLength)=0; // Remove the basename.
111 return path;
112 }
113 return NULL;
114 }
115
116 U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
117
118 // TODO: Make "public" in uparse.h.
119 #define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
120
parseLevels(const char * start)121 UBool BiDiConformanceTest::parseLevels(const char *start) {
122 directionBits=0;
123 levelsCount=0;
124 while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
125 if(*start=='x') {
126 levels[levelsCount++]=UBIDI_DEFAULT_LTR;
127 ++start;
128 } else {
129 char *end;
130 uint32_t value=(uint32_t)strtoul(start, &end, 10);
131 if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>(UBIDI_MAX_EXPLICIT_LEVEL+1)) {
132 errln("@Levels: parse error at %s", start);
133 return FALSE;
134 }
135 levels[levelsCount++]=(UBiDiLevel)value;
136 directionBits|=(1<<(value&1));
137 start=end;
138 }
139 }
140 return TRUE;
141 }
142
parseOrdering(const char * start)143 UBool BiDiConformanceTest::parseOrdering(const char *start) {
144 orderingCount=0;
145 while(*start!=0 && *(start=u_skipWhitespace(start))!=0) {
146 char *end;
147 uint32_t value=(uint32_t)strtoul(start, &end, 10);
148 if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=0) || value>=1000) {
149 errln("@Reorder: parse error at %s", start);
150 return FALSE;
151 }
152 ordering[orderingCount++]=(int32_t)value;
153 start=end;
154 }
155 return TRUE;
156 }
157
158 static const UChar charFromBiDiClass[U_CHAR_DIRECTION_COUNT]={
159 0x6c, // 'l' for L
160 0x52, // 'R' for R
161 0x33, // '3' for EN
162 0x2d, // '-' for ES
163 0x25, // '%' for ET
164 0x39, // '9' for AN
165 0x2c, // ',' for CS
166 0x2f, // '/' for B
167 0x5f, // '_' for S
168 0x20, // ' ' for WS
169 0x3d, // '=' for ON
170 0x65, // 'e' for LRE
171 0x6f, // 'o' for LRO
172 0x41, // 'A' for AL
173 0x45, // 'E' for RLE
174 0x4f, // 'O' for RLO
175 0x2a, // '*' for PDF
176 0x60, // '`' for NSM
177 0x7c // '|' for BN
178 };
179
180 U_CDECL_BEGIN
181
182 static UCharDirection U_CALLCONV
biDiConfUBiDiClassCallback(const void * context,UChar32 c)183 biDiConfUBiDiClassCallback(const void *context, UChar32 c) {
184 for(int i=0; i<U_CHAR_DIRECTION_COUNT; ++i) {
185 if(c==charFromBiDiClass[i]) {
186 return (UCharDirection)i;
187 }
188 }
189 // Character not in our hardcoded table.
190 // Should not occur during testing.
191 return U_BIDI_CLASS_DEFAULT;
192 }
193
194 U_CDECL_END
195
196 static const int8_t biDiClassNameLengths[U_CHAR_DIRECTION_COUNT+1]={
197 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 2, 3, 3, 3, 3, 2, 0
198 };
199
parseInputStringFromBiDiClasses(const char * & start)200 UBool BiDiConformanceTest::parseInputStringFromBiDiClasses(const char *&start) {
201 inputString.remove();
202 /*
203 * Lengthy but fast BiDi class parser.
204 * A simple parser could terminate or extract the name string and use
205 * int32_t biDiClassInt=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, bidiClassString);
206 * but that makes this test take significantly more time.
207 */
208 while(*start!=0 && *(start=u_skipWhitespace(start))!=0 && *start!=';') {
209 UCharDirection biDiClass=U_CHAR_DIRECTION_COUNT;
210 // Compare each character once until we have a match on
211 // a complete, short BiDi class name.
212 if(start[0]=='L') {
213 if(start[1]=='R') {
214 if(start[2]=='E') {
215 biDiClass=U_LEFT_TO_RIGHT_EMBEDDING;
216 } else if(start[2]=='O') {
217 biDiClass=U_LEFT_TO_RIGHT_OVERRIDE;
218 }
219 } else {
220 biDiClass=U_LEFT_TO_RIGHT;
221 }
222 } else if(start[0]=='R') {
223 if(start[1]=='L') {
224 if(start[2]=='E') {
225 biDiClass=U_RIGHT_TO_LEFT_EMBEDDING;
226 } else if(start[2]=='O') {
227 biDiClass=U_RIGHT_TO_LEFT_OVERRIDE;
228 }
229 } else {
230 biDiClass=U_RIGHT_TO_LEFT;
231 }
232 } else if(start[0]=='E') {
233 if(start[1]=='N') {
234 biDiClass=U_EUROPEAN_NUMBER;
235 } else if(start[1]=='S') {
236 biDiClass=U_EUROPEAN_NUMBER_SEPARATOR;
237 } else if(start[1]=='T') {
238 biDiClass=U_EUROPEAN_NUMBER_TERMINATOR;
239 }
240 } else if(start[0]=='A') {
241 if(start[1]=='L') {
242 biDiClass=U_RIGHT_TO_LEFT_ARABIC;
243 } else if(start[1]=='N') {
244 biDiClass=U_ARABIC_NUMBER;
245 }
246 } else if(start[0]=='C' && start[1]=='S') {
247 biDiClass=U_COMMON_NUMBER_SEPARATOR;
248 } else if(start[0]=='B') {
249 if(start[1]=='N') {
250 biDiClass=U_BOUNDARY_NEUTRAL;
251 } else {
252 biDiClass=U_BLOCK_SEPARATOR;
253 }
254 } else if(start[0]=='S') {
255 biDiClass=U_SEGMENT_SEPARATOR;
256 } else if(start[0]=='W' && start[1]=='S') {
257 biDiClass=U_WHITE_SPACE_NEUTRAL;
258 } else if(start[0]=='O' && start[1]=='N') {
259 biDiClass=U_OTHER_NEUTRAL;
260 } else if(start[0]=='P' && start[1]=='D' && start[2]=='F') {
261 biDiClass=U_POP_DIRECTIONAL_FORMAT;
262 } else if(start[0]=='N' && start[1]=='S' && start[2]=='M') {
263 biDiClass=U_DIR_NON_SPACING_MARK;
264 }
265 // Now we verify that the class name is terminated properly,
266 // and not just the start of a longer word.
267 int8_t biDiClassNameLength=biDiClassNameLengths[biDiClass];
268 char c=start[biDiClassNameLength];
269 if(biDiClass==U_CHAR_DIRECTION_COUNT || (!U_IS_INV_WHITESPACE(c) && c!=';' && c!=0)) {
270 errln("BiDi class string not recognized at %s", start);
271 return FALSE;
272 }
273 inputString.append(charFromBiDiClass[biDiClass]);
274 start+=biDiClassNameLength;
275 }
276 return TRUE;
277 }
278
TestBidiTest()279 void BiDiConformanceTest::TestBidiTest() {
280 IcuTestErrorCode errorCode(*this, "TestBidiTest");
281 const char *sourceTestDataPath=getSourceTestData(errorCode);
282 if(errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
283 "folder (getSourceTestData())")) {
284 return;
285 }
286 char bidiTestPath[400];
287 strcpy(bidiTestPath, sourceTestDataPath);
288 strcat(bidiTestPath, "BidiTest.txt");
289 LocalStdioFilePointer bidiTestFile(fopen(bidiTestPath, "r"));
290 if(bidiTestFile.isNull()) {
291 errln("unable to open %s", bidiTestPath);
292 return;
293 }
294 LocalUBiDiPointer ubidi(ubidi_open());
295 ubidi_setClassCallback(ubidi.getAlias(), biDiConfUBiDiClassCallback, NULL,
296 NULL, NULL, errorCode);
297 if(errorCode.logIfFailureAndReset("ubidi_setClassCallback()")) {
298 return;
299 }
300 lineNumber=0;
301 levelsCount=0;
302 orderingCount=0;
303 errorCount=0;
304 while(errorCount<10 && fgets(line, (int)sizeof(line), bidiTestFile.getAlias())!=NULL) {
305 ++lineNumber;
306 // Remove trailing comments and whitespace.
307 char *commentStart=strchr(line, '#');
308 if(commentStart!=NULL) {
309 *commentStart=0;
310 }
311 u_rtrim(line);
312 const char *start=u_skipWhitespace(line);
313 if(*start==0) {
314 continue; // Skip empty and comment-only lines.
315 }
316 if(*start=='@') {
317 ++start;
318 if(0==strncmp(start, "Levels:", 7)) {
319 if(!parseLevels(start+7)) {
320 return;
321 }
322 } else if(0==strncmp(start, "Reorder:", 8)) {
323 if(!parseOrdering(start+8)) {
324 return;
325 }
326 }
327 // Skip unknown @Xyz: ...
328 } else {
329 if(!parseInputStringFromBiDiClasses(start)) {
330 return;
331 }
332 start=u_skipWhitespace(start);
333 if(*start!=';') {
334 errln("missing ; separator on input line %s", line);
335 return;
336 }
337 start=u_skipWhitespace(start+1);
338 char *end;
339 uint32_t bitset=(uint32_t)strtoul(start, &end, 10);
340 if(end<=start || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0)) {
341 errln("input bitset parse error at %s", start);
342 return;
343 }
344 // Loop over the bitset.
345 static const UBiDiLevel paraLevels[]={ UBIDI_DEFAULT_LTR, 0, 1 };
346 static const char *const paraLevelNames[]={ "auto/LTR", "LTR", "RTL" };
347 for(int i=0; i<=2; ++i) {
348 if(bitset&(1<<i)) {
349 ubidi_setPara(ubidi.getAlias(), inputString.getBuffer(), inputString.length(),
350 paraLevels[i], NULL, errorCode);
351 const UBiDiLevel *actualLevels=ubidi_getLevels(ubidi.getAlias(), errorCode);
352 if(errorCode.logIfFailureAndReset("ubidi_setPara() or ubidi_getLevels()")) {
353 errln("Input line %d: %s", (int)lineNumber, line);
354 return;
355 }
356 if(!checkLevels(actualLevels, ubidi_getProcessedLength(ubidi.getAlias()),
357 paraLevelNames[i])) {
358 // continue outerLoop; does not exist in C++
359 // so just break out of the inner loop.
360 break;
361 }
362 if(!checkOrdering(ubidi.getAlias(), paraLevelNames[i])) {
363 // continue outerLoop; does not exist in C++
364 // so just break out of the inner loop.
365 break;
366 }
367 }
368 }
369 }
370 }
371 }
372
printLevel(UBiDiLevel level)373 static UChar printLevel(UBiDiLevel level) {
374 if(level<UBIDI_DEFAULT_LTR) {
375 return 0x30+level;
376 } else {
377 return 0x78; // 'x'
378 }
379 }
380
// Summarizes the directions among the given levels:
// bit 0 is set if any level is even, bit 1 if any level is odd.
static uint32_t getDirectionBits(const UBiDiLevel actualLevels[], int32_t actualCount) {
    uint32_t bits=0;
    int32_t remaining=actualCount;
    while(remaining>0) {
        --remaining;
        bits|=(1<<(actualLevels[remaining]&1));
    }
    return bits;
}
388
checkLevels(const UBiDiLevel actualLevels[],int32_t actualCount,const char * paraLevelName)389 UBool BiDiConformanceTest::checkLevels(const UBiDiLevel actualLevels[], int32_t actualCount,
390 const char *paraLevelName) {
391 UBool isOk=TRUE;
392 if(levelsCount!=actualCount) {
393 errln("Wrong number of level values; expected %d actual %d",
394 (int)levelsCount, (int)actualCount);
395 isOk=FALSE;
396 } else {
397 for(int32_t i=0; i<actualCount; ++i) {
398 if(levels[i]!=actualLevels[i] && levels[i]<UBIDI_DEFAULT_LTR) {
399 if(directionBits!=3 && directionBits==getDirectionBits(actualLevels, actualCount)) {
400 // ICU used a shortcut:
401 // Since the text is unidirectional, it did not store the resolved
402 // levels but just returns all levels as the paragraph level 0 or 1.
403 // The reordering result is the same, so this is fine.
404 break;
405 } else {
406 errln("Wrong level value at index %d; expected %d actual %d",
407 (int)i, levels[i], actualLevels[i]);
408 isOk=FALSE;
409 break;
410 }
411 }
412 }
413 }
414 if(!isOk) {
415 printErrorLine(paraLevelName);
416 UnicodeString els("Expected levels: ");
417 int32_t i;
418 for(i=0; i<levelsCount; ++i) {
419 els.append((UChar)0x20).append(printLevel(levels[i]));
420 }
421 UnicodeString als("Actual levels: ");
422 for(i=0; i<actualCount; ++i) {
423 als.append((UChar)0x20).append(printLevel(actualLevels[i]));
424 }
425 errln(els);
426 errln(als);
427 }
428 return isOk;
429 }
430
431 // Note: ubidi_setReorderingOptions(ubidi, UBIDI_OPTION_REMOVE_CONTROLS);
432 // does not work for custom BiDi class assignments
433 // and anyway also removes LRM/RLM/ZWJ/ZWNJ which is not desirable here.
434 // Therefore we just skip the indexes for BiDi controls while comparing
435 // with the expected ordering that has them omitted.
checkOrdering(UBiDi * ubidi,const char * paraLevelName)436 UBool BiDiConformanceTest::checkOrdering(UBiDi *ubidi, const char *paraLevelName) {
437 UBool isOk=TRUE;
438 IcuTestErrorCode errorCode(*this, "TestBidiTest/checkOrdering()");
439 int32_t resultLength=ubidi_getResultLength(ubidi); // visual length including BiDi controls
440 int32_t i, visualIndex;
441 // Note: It should be faster to call ubidi_countRuns()/ubidi_getVisualRun()
442 // and loop over each run's indexes, but that seems unnecessary for this test code.
443 for(i=visualIndex=0; i<resultLength; ++i) {
444 int32_t logicalIndex=ubidi_getLogicalIndex(ubidi, i, errorCode);
445 if(errorCode.logIfFailureAndReset("ubidi_getLogicalIndex()")) {
446 errln("Input line %d: %s", (int)lineNumber, line);
447 return FALSE;
448 }
449 if(levels[logicalIndex]>=UBIDI_DEFAULT_LTR) {
450 continue; // BiDi control, omitted from expected ordering.
451 }
452 if(visualIndex<orderingCount && logicalIndex!=ordering[visualIndex]) {
453 errln("Wrong ordering value at visual index %d; expected %d actual %d",
454 (int)visualIndex, ordering[visualIndex], logicalIndex);
455 isOk=FALSE;
456 break;
457 }
458 ++visualIndex;
459 }
460 // visualIndex is now the visual length minus the BiDi controls,
461 // which should match the length of the BidiTest.txt ordering.
462 if(isOk && orderingCount!=visualIndex) {
463 errln("Wrong number of ordering values; expected %d actual %d",
464 (int)orderingCount, (int)visualIndex);
465 isOk=FALSE;
466 }
467 if(!isOk) {
468 printErrorLine(paraLevelName);
469 UnicodeString eord("Expected ordering: ");
470 for(i=0; i<orderingCount; ++i) {
471 eord.append((UChar)0x20).append((UChar)(0x30+ordering[i]));
472 }
473 UnicodeString aord("Actual ordering: ");
474 for(i=0; i<resultLength; ++i) {
475 int32_t logicalIndex=ubidi_getLogicalIndex(ubidi, i, errorCode);
476 if(levels[logicalIndex]<UBIDI_DEFAULT_LTR) {
477 aord.append((UChar)0x20).append((UChar)(0x30+logicalIndex));
478 }
479 }
480 errln(eord);
481 errln(aord);
482 }
483 return isOk;
484 }
485
printErrorLine(const char * paraLevelName)486 void BiDiConformanceTest::printErrorLine(const char *paraLevelName) {
487 ++errorCount;
488 errln("Input line %5d: %s", (int)lineNumber, line);
489 errln(UnicodeString("Input string: ")+inputString);
490 errln("Para level: %s", paraLevelName);
491 }
492