#!/usr/bin/env python3
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
import sys
import string
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = []
blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
blockAliases.append("Greek:GreekandCoptic")
blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
                    "SupplementaryPrivateUseArea-B")

# minTableSize is the threshold for producing a range table: a table is
# generated only for categories with MORE than this number of ranges;
# otherwise inline comparisons are generated
minTableSize = 8

(blockfile, catfile) = sources.split()


#
# Now process the "blocks" file, reducing it to a dictionary
# indexed by blockname, containing a list of (start, end) tuples
# with the applicable block range(s).  The bounds are kept as
# "0x..." strings because they are pasted verbatim into the C output.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except OSError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

with blocks:
    for line in blocks:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # missing ';' field, or the first field is not "start..end"
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames.keys())))

# Expand the aliases: the old block name receives the ranges of every
# current block it maps to.
for entry in blockAliases:
    alias = entry.split(':')
    old_name = alias[0]
    for comp in alias[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, comp))
            continue
        BlockNames.setdefault(old_name, []).extend(BlockNames[comp])

#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the ranges (codepoints) of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
# NOTE(review): every line is recorded as a single code point.  The
# UnicodeData format reportedly describes some large ranges with a pair
# of "<..., First>" / "<..., Last>" lines; if so, only the two endpoints
# end up in the tables - confirm whether the interior should be expanded.
#
try:
    data = open(catfile, "r")
except OSError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
# 'with' guarantees the category file itself is closed (previously the
# already-closed blocks file was closed a second time instead).
with data:
    for line in data:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            value = int(fields[0].strip(), 16)
            name = fields[2]
            general = name[0]
        except (IndexError, ValueError):
            # too few fields, non-hex code point, or empty category name
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(general, []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))

#
# The data is now all read.  Time to process it into a more useful form.
#
# reduce the number list into ranges
#
# NOTE(review): whitespace inside the emitted C text below is kept exactly
# as found in this script; widen the indentation in the string literals if
# prettier generated code is wanted.
#
def _collapse_ranges(values):
    """Collapse a list of code points into inclusive (low, high) runs.

    Consecutive values (each one greater than the previous by exactly 1)
    are merged into one tuple; an isolated value yields (value, value).
    The input is expected to be in ascending order, which is the order in
    which the values were appended while reading the category file.
    """
    ranges = []
    first = last = -1
    for val in values:
        if first == -1:
            first = last = val
        elif val == last + 1:
            last = val
        else:
            ranges.append((first, last))
            first = last = val
    ranges.append((first, last))
    return ranges


def _write_range_tables(out, name, ranges):
    """Write the xmlCharInRange tables for category `name` to `out`.

    Ranges whose upper bound is below 0x10000 go into an xmlChSRange
    array (xml<name>S), all others into an xmlChLRange array (xml<name>L);
    an xmlChRangeGroup (xml<name>G) referencing both is written last.
    A missing array is represented by NULL with a count of 0.
    """
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high < 0x10000:
            if numshort == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ","
            numshort += 1
        else:
            if numlong == 0:
                # first long range: terminate the short table, if any
                if numshort > 0:
                    out.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ","
            numlong += 1
        # wrap the generated line once it grows past 60 characters
        if len(pline) > 60:
            out.write(pline + "\n")
            pline = " "
        elif pline[-1:] == ",":
            pline += " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    out.write(pline + " };\nstatic const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
              % (name, numshort, numlong, sptr, lptr))


for cat in Categories:
    Categories[cat] = _collapse_ranges(Categories[cat])

#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
bkeys = sorted(BlockNames)

ckeys = sorted(Categories)

#
# Generate the resulting files
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except OSError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except OSError:
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

date = time.asctime(time.localtime(time.time()))

# Both files are closed (and flushed) even if generation fails part-way.
with header, output:
    header.write(
"""/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

""" % (webpage, date, sources))

    output.write(
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
 const char *rangename;
 xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
 const xmlUnicodeRange *table;
 int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
""" % (webpage, date, sources))

    # name -> function lookup tables, in sorted order for the binary search
    output.write(',\n'.join(' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
                            for block in bkeys))
    output.write('};\n\n')

    output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
    output.write(',\n'.join(' {"%s", xmlUCSIsCat%s}' % (name, name)
                            for name in ckeys))
    output.write('};\n\n')

    #
    # For any categories with more than minTableSize ranges we generate
    # a range table suitable for xmlCharInRange
    #
    for name in ckeys:
        if len(Categories[name]) > minTableSize:
            _write_range_tables(output, name, Categories[name])

    output.write(
"""static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
 int low, high, mid, cmp;
 const xmlUnicodeRange *sptr;

 if ((tptr == NULL) || (tname == NULL)) return(NULL);

 low = 0;
 high = tptr->numentries - 1;
 sptr = tptr->table;
 while (low <= high) {
 mid = (low + high) / 2;
 if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
 return (sptr[mid].func);
 if (cmp < 0)
 high = mid - 1;
 else
 low = mid + 1;
 }
 return (NULL);
}

""" % (len(BlockNames), len(Categories)))

    # one xmlUCSIs<Block>() predicate per block; the C identifier drops '-'
    for block in bkeys:
        name = block.replace('-', '')
        header.write("XMLPUBFUN int xmlUCSIs%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Block\n" %
                     (block))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
        output.write(" ||\n ".join("((code >= %s) && (code <= %s))" % (low, high)
                                   for (low, high) in BlockNames[block]))
        output.write(");\n}\n\n")

    header.write("\nXMLPUBFUN int xmlUCSIsBlock\t(int code, const char *block);\n\n")
    output.write(
"""/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
 xmlIntFunc *func;

 func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
 if (func == NULL)
 return (-1);
 return (func(code));
}

""")

    # one xmlUCSIsCat<Name>() predicate per category: table lookup for the
    # big categories, inline comparisons for the small ones
    for name in ckeys:
        ranges = Categories[name]
        header.write("XMLPUBFUN int xmlUCSIsCat%s\t(int code);\n" % name)
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                     (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        if len(ranges) > minTableSize:
            output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                         % name)
        else:
            tests = []
            for (begin, end) in ranges:
                if begin == end:
                    tests.append("(code == %s)" % (hex(begin)))
                else:
                    tests.append("((code >= %s) && (code <= %s))" % (
                        hex(begin), hex(end)))
            if tests:
                output.write(" return(" + " ||\n ".join(tests))
        output.write(");\n}\n\n")

    header.write("\nXMLPUBFUN int xmlUCSIsCat\t(int code, const char *cat);\n")
    output.write(
"""/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
 xmlIntFunc *func;

 func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
 if (func == NULL)
 return (-1);
 return (func(code));
}

#endif /* LIBXML_UNICODE_ENABLED */
""")

    header.write("""
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
""")