1#!/usr/bin/env python3 2# 3# Portions of this script have been (shamelessly) stolen from the 4# prior work of Daniel Veillard (genUnicode.py) 5# 6# I, however, take full credit for any bugs, errors or difficulties :-) 7# 8# William Brack 9# October 2003 10# 11# 18 October 2003 12# Modified to maintain binary compatibility with previous library versions 13# by adding a suffix 'Q' ('quick') to the macro generated for the original, 14# function, and adding generation of a function (with the original name) which 15# instantiates the macro. 16# 17 18import sys 19import time 20 21# 22# A routine to take a list of yes/no (1, 0) values and turn it 23# into a list of ranges. This will later be used to determine whether 24# to generate single-byte lookup tables, or inline comparisons 25# 26def makeRange(lst): 27 ret = [] 28 pos = 0 29 while pos < len(lst): 30 try: # index generates exception if not present 31 s = lst[pos:].index(1) # look for start of next range 32 except: 33 break # if no more, finished 34 pos += s # pointer to start of possible range 35 try: 36 e = lst[pos:].index(0) # look for end of range 37 e += pos 38 except: # if no end, set to end of list 39 e = len(lst) 40 ret.append((pos, e-1)) # append range tuple to list 41 pos = e + 1 # ready to check for next range 42 return ret 43 44sources = "chvalid.def" # input filename 45 46# minTableSize gives the minimum number of ranges which must be present 47# before a 256-byte lookup table is produced. If there are less than this 48# number, a macro with inline comparisons is generated 49minTableSize = 6 50 51# dictionary of functions, key=name, element contains char-map and range-list 52Functs = {} 53 54state = 0 55 56try: 57 defines = open("chvalid.def", "r") 58except: 59 print("Missing chvalid.def, aborting ...") 60 sys.exit(1) 61 62# 63# The lines in the .def file have three types:- 64# name: Defines a new function block 65# ur: Defines individual or ranges of unicode values 66# end: Indicates the end of the function block 67# 68# These lines are processed below. 69# 70for line in defines.readlines(): 71 # ignore blank lines, or lines beginning with '#' 72 if line[0] == '#': 73 continue 74 line = line.strip() 75 if line == '': 76 continue 77 # split line into space-separated fields, then split on type 78 try: 79 fields = line.split(' ') 80 # 81 # name line: 82 # validate any previous function block already ended 83 # validate this function not already defined 84 # initialize an entry in the function dicitonary 85 # including a mask table with no values yet defined 86 # 87 if fields[0] == 'name': 88 name = fields[1] 89 if state != 0: 90 print("'name' %s found before previous name" \ 91 "completed" % (fields[1])) 92 continue 93 state = 1 94 if name in Functs: 95 print("name '%s' already present - may give" \ 96 " wrong results" % (name)) 97 else: 98 # dict entry with two list elements (chdata, rangedata) 99 Functs[name] = [ [], [] ] 100 for v in range(256): 101 Functs[name][0].append(0) 102 # 103 # end line: 104 # validate there was a preceding function name line 105 # set state to show no current function active 106 # 107 elif fields[0] == 'end': 108 if state == 0: 109 print("'end' found outside of function block") 110 continue 111 state = 0 112 113 # 114 # ur line: 115 # validate function has been defined 116 # process remaining fields on the line, which may be either 117 # individual unicode values or ranges of values 118 # 119 elif fields[0] == 'ur': 120 if state != 1: 121 raise Exception("'ur' found outside of 'name' block") 122 for el in fields[1:]: 123 pos = el.find('..') 124 # pos <=0 means not a range, so must be individual value 125 if pos <= 0: 126 # cheap handling of hex or decimal values 127 if el[0:2] == '0x': 128 value = int(el[2:],16) 129 elif el[0] == "'": 130 value = ord(el[1]) 131 else: 132 value = int(el) 133 if ((value < 0) | (value > 0x1fffff)): 134 raise Exception('Illegal value (%s) in ch for'\ 135 ' name %s' % (el,name)) 136 # for ur we have only ranges (makes things simpler), 137 # so convert val to range 138 currange = (value, value) 139 # pos > 0 means this is a range, so isolate/validate 140 # the interval 141 else: 142 # split the range into it's first-val, last-val 143 (first, last) = el.split("..") 144 # convert values from text into binary 145 if first[0:2] == '0x': 146 start = int(first[2:],16) 147 elif first[0] == "'": 148 start = ord(first[1]) 149 else: 150 start = int(first) 151 if last[0:2] == '0x': 152 end = int(last[2:],16) 153 elif last[0] == "'": 154 end = ord(last[1]) 155 else: 156 end = int(last) 157 if (start < 0) | (end > 0x1fffff) | (start > end): 158 raise Exception("Invalid range '%s'" % el) 159 currange = (start, end) 160 # common path - 'currange' has the range, now take care of it 161 # We split on single-byte values vs. multibyte 162 if currange[1] < 0x100: # single-byte 163 for ch in range(currange[0],currange[1]+1): 164 # validate that value not previously defined 165 if Functs[name][0][ch]: 166 msg = "Duplicate ch value '%s' for name '%s'" % (el, name) 167 raise Exception(msg) 168 Functs[name][0][ch] = 1 169 else: # multi-byte 170 if currange in Functs[name][1]: 171 raise Exception("range already defined in" \ 172 " function") 173 else: 174 Functs[name][1].append(currange) 175 176 except: 177 print("Failed to process line: %s" % (line)) 178 raise 179# 180# At this point, the entire definition file has been processed. Now we 181# enter the output phase, where we generate the two files chvalid.c and' 182# chvalid.h 183# 184# To do this, we first output the 'static' data (heading, fixed 185# definitions, etc.), then output the 'dynamic' data (the results 186# of the above processing), and finally output closing 'static' data 187# (e.g. the subroutine to process the ranges) 188# 189 190# 191# Generate the headings: 192# 193try: 194 header = open("include/libxml/chvalid.h", "w") 195except: 196 print("Failed to open include/libxml/chvalid.h") 197 sys.exit(1) 198 199try: 200 output = open("chvalid.c", "w") 201except: 202 print("Failed to open chvalid.c") 203 sys.exit(1) 204 205date = time.asctime(time.localtime(time.time())) 206 207header.write( 208"""/* 209 * Summary: Unicode character range checking 210 * Description: this module exports interfaces for the character 211 * range validation APIs 212 * 213 * This file is automatically generated from the cvs source 214 * definition files using the genChRanges.py Python script 215 * 216 * Generation date: %s 217 * Sources: %s 218 * Author: William Brack <wbrack@mmm.com.hk> 219 */ 220 221#ifndef __XML_CHVALID_H__ 222#define __XML_CHVALID_H__ 223 224#include <libxml/xmlversion.h> 225#include <libxml/xmlstring.h> 226 227#ifdef __cplusplus 228extern "C" { 229#endif 230 231/* 232 * Define our typedefs and structures 233 * 234 */ 235typedef struct _xmlChSRange xmlChSRange; 236typedef xmlChSRange *xmlChSRangePtr; 237struct _xmlChSRange { 238 unsigned short\tlow; 239 unsigned short\thigh; 240}; 241 242typedef struct _xmlChLRange xmlChLRange; 243typedef xmlChLRange *xmlChLRangePtr; 244struct _xmlChLRange { 245 unsigned int\tlow; 246 unsigned int\thigh; 247}; 248 249typedef struct _xmlChRangeGroup xmlChRangeGroup; 250typedef xmlChRangeGroup *xmlChRangeGroupPtr; 251struct _xmlChRangeGroup { 252 int\t\t\tnbShortRange; 253 int\t\t\tnbLongRange; 254 const xmlChSRange\t*shortRange;\t/* points to an array of ranges */ 255 const xmlChLRange\t*longRange; 256}; 257 258/** 259 * Range checking routine 260 */ 261XMLPUBFUN int 262\t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group); 263 264""" % (date, sources)); 265output.write( 266"""/* 267 * chvalid.c:\tthis module implements the character range 268 *\t\tvalidation APIs 269 * 270 * This file is automatically generated from the cvs source 271 * definition files using the genChRanges.py Python script 272 * 273 * Generation date: %s 274 * Sources: %s 275 * William Brack <wbrack@mmm.com.hk> 276 */ 277 278#define IN_LIBXML 279#include "libxml.h" 280#include <libxml/chvalid.h> 281 282#include <stddef.h> 283 284/* 285 * The initial tables ({func_name}_tab) are used to validate whether a 286 * single-byte character is within the specified group. Each table 287 * contains 256 bytes, with each byte representing one of the 256 288 * possible characters. If the table byte is set, the character is 289 * allowed. 290 * 291 */ 292""" % (date, sources)); 293 294# 295# Now output the generated data. 296# We try to produce the best execution times. Tests have shown that validation 297# with direct table lookup is, when there are a "small" number of valid items, 298# still not as fast as a sequence of inline compares. So, if the single-byte 299# portion of a range has a "small" number of ranges, we output a macro for inline 300# compares, otherwise we output a 256-byte table and a macro to use it. 301# 302 303fkeys = sorted(Functs.keys()) 304 305for f in fkeys: 306 307# First we convert the specified single-byte values into a group of ranges. 308# If the total number of such ranges is less than minTableSize, we generate 309# an inline macro for direct comparisons; if greater, we generate a lookup 310# table. 311 if max(Functs[f][0]) > 0: # only check if at least one entry 312 rangeTable = makeRange(Functs[f][0]) 313 numRanges = len(rangeTable) 314 if numRanges >= minTableSize: # table is worthwhile 315 header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f) 316 header.write(""" 317/** 318 * %s_ch: 319 * @c: char to validate 320 * 321 * Automatically generated by genChRanges.py 322 */ 323""" % f) 324 header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f)) 325 326 # write the constant data to the code file 327 output.write("const unsigned char %s_tab[256] = {\n" % f) 328 pline = " " 329 for n in range(255): 330 pline += " 0x%02x," % Functs[f][0][n] 331 if len(pline) > 72: 332 output.write(pline + "\n") 333 pline = " " 334 output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255]) 335 336 else: # inline check is used 337 # first another little optimisation - if space is present, 338 # put it at the front of the list so it is checked first 339 try: 340 ix = rangeTable.remove((0x20, 0x20)) 341 rangeTable.insert(0, (0x20, 0x20)) 342 except: 343 pass 344 firstFlag = 1 345 346 header.write(""" 347/** 348 * %s_ch: 349 * @c: char to validate 350 * 351 * Automatically generated by genChRanges.py 352 */ 353""" % f) 354 # okay, I'm tired of the messy lineup - let's automate it! 355 pline = "#define %s_ch(c)" % f 356 # 'ntab' is number of tabs needed to position to col. 33 from name end 357 ntab = 4 - (len(pline)) // 8 358 if ntab < 0: 359 ntab = 0 360 just = "" 361 for i in range(ntab): 362 just += "\t" 363 pline = pline + just + "(" 364 for rg in rangeTable: 365 if not firstFlag: 366 pline += " || \\\n\t\t\t\t " 367 else: 368 firstFlag = 0 369 if rg[0] == rg[1]: # single value - check equal 370 pline += "((c) == 0x%x)" % rg[0] 371 else: # value range 372 # since we are doing char, also change range ending in 0xff 373 if rg[1] != 0xff: 374 pline += "((0x%x <= (c)) &&" % rg[0] 375 pline += " ((c) <= 0x%x))" % rg[1] 376 else: 377 pline += " (0x%x <= (c))" % rg[0] 378 pline += ")\n" 379 header.write(pline) 380 381 header.write(""" 382/** 383 * %sQ: 384 * @c: char to validate 385 * 386 * Automatically generated by genChRanges.py 387 */ 388""" % f) 389 pline = "#define %sQ(c)" % f 390 ntab = 4 - (len(pline)) // 8 391 if ntab < 0: 392 ntab = 0 393 just = "" 394 for i in range(ntab): 395 just += "\t" 396 header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ") 397 if max(Functs[f][0]) > 0: 398 header.write("%s_ch((c)) :" % f) 399 else: 400 header.write("0 :") 401 402 # if no ranges defined, value invalid if >= 0x100 403 numRanges = len(Functs[f][1]) 404 if numRanges == 0: 405 header.write(" 0)\n\n") 406 else: 407 if numRanges >= minTableSize: 408 header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n" % f) 409 else: # if < minTableSize, generate inline code 410 firstFlag = 1 411 for rg in Functs[f][1]: 412 if not firstFlag: 413 pline += " || \\\n\t\t\t\t " 414 else: 415 firstFlag = 0 416 pline = "\\\n\t\t\t\t(" 417 if rg[0] == rg[1]: # single value - check equal 418 pline += "((c) == 0x%x)" % rg[0] 419 else: # value range 420 pline += "((0x%x <= (c)) &&" % rg[0] 421 pline += " ((c) <= 0x%x))" % rg[1] 422 pline += "))\n\n" 423 header.write(pline) 424 425 426 if len(Functs[f][1]) > 0: 427 header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f) 428 429 430# 431# Next we do the unicode ranges 432# 433 434for f in fkeys: 435 if len(Functs[f][1]) > 0: # only generate if unicode ranges present 436 rangeTable = Functs[f][1] 437 rangeTable.sort() # ascending tuple sequence 438 numShort = 0 439 numLong = 0 440 for rg in rangeTable: 441 if rg[1] < 0x10000: # if short value 442 if numShort == 0: # first occurrence 443 pline = "static const xmlChSRange %s_srng[] = {" % f 444 else: 445 pline += "," 446 numShort += 1 447 if len(pline) > 60: 448 output.write(pline + "\n") 449 pline = " " 450 else: 451 pline += " " 452 pline += "{0x%x, 0x%x}" % (rg[0], rg[1]) 453 else: # if long value 454 if numLong == 0: # first occurrence 455 if numShort > 0: # if there were shorts, finish them off 456 output.write(pline + "};\n") 457 pline = "static const xmlChLRange %s_lrng[] = { " % f 458 else: 459 pline += ", " 460 numLong += 1 461 if len(pline) > 60: 462 output.write(pline + "\n") 463 pline = " " 464 pline += "{0x%x, 0x%x}" % (rg[0], rg[1]) 465 output.write(pline + "};\n") # finish off last group 466 467 pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong) 468 if numShort > 0: 469 pline += "%s_srng" % f 470 else: 471 pline += "(xmlChSRangePtr)0" 472 if numLong > 0: 473 pline += ", %s_lrng" % f 474 else: 475 pline += ", (xmlChLRangePtr)0" 476 477 output.write(pline + "};\n\n") 478 479output.write( 480""" 481/** 482 * xmlCharInRange: 483 * @val: character to be validated 484 * @rptr: pointer to range to be used to validate 485 * 486 * Does a binary search of the range table to determine if char 487 * is valid 488 * 489 * Returns: true if character valid, false otherwise 490 */ 491int 492xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) { 493 int low, high, mid; 494 const xmlChSRange *sptr; 495 const xmlChLRange *lptr; 496 497 if (rptr == NULL) return(0); 498 if (val < 0x10000) {\t/* is val in 'short' or 'long' array? */ 499\tif (rptr->nbShortRange == 0) 500\t return 0; 501\tlow = 0; 502\thigh = rptr->nbShortRange - 1; 503\tsptr = rptr->shortRange; 504\twhile (low <= high) { 505\t mid = (low + high) / 2; 506\t if ((unsigned short) val < sptr[mid].low) { 507\t\thigh = mid - 1; 508\t } else { 509\t\tif ((unsigned short) val > sptr[mid].high) { 510\t\t low = mid + 1; 511\t\t} else { 512\t\t return 1; 513\t\t} 514\t } 515\t} 516 } else { 517\tif (rptr->nbLongRange == 0) { 518\t return 0; 519\t} 520\tlow = 0; 521\thigh = rptr->nbLongRange - 1; 522\tlptr = rptr->longRange; 523\twhile (low <= high) { 524\t mid = (low + high) / 2; 525\t if (val < lptr[mid].low) { 526\t\thigh = mid - 1; 527\t } else { 528\t\tif (val > lptr[mid].high) { 529\t\t low = mid + 1; 530\t\t} else { 531\t\t return 1; 532\t\t} 533\t } 534\t} 535 } 536 return 0; 537} 538 539"""); 540 541# 542# finally, generate the ABI compatibility functions 543# 544for f in fkeys: 545 output.write(""" 546/** 547 * %s: 548 * @ch: character to validate 549 * 550 * This function is DEPRECATED. 551""" % f); 552 if max(Functs[f][0]) > 0: 553 output.write(" * Use %s_ch or %sQ instead" % (f, f)) 554 else: 555 output.write(" * Use %sQ instead" % f) 556 output.write(""" 557 * 558 * Returns true if argument valid, false otherwise 559 */ 560""") 561 output.write("int\n%s(unsigned int ch) {\n return(%sQ(ch));\n}\n\n" % (f,f)) 562 header.write("XMLPUBFUN int\n\t\t%s(unsigned int ch);\n" % f); 563# 564# Run complete - write trailers and close the output files 565# 566 567header.write(""" 568#ifdef __cplusplus 569} 570#endif 571#endif /* __XML_CHVALID_H__ */ 572""") 573 574header.close() 575 576output.close() 577 578