• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2#
3# Portions of this script have been (shamelessly) stolen from the
4# prior work of Daniel Veillard (genUnicode.py)
5#
6# I, however, take full credit for any bugs, errors or difficulties :-)
7#
8# William Brack
9# October 2003
10#
11# 18 October 2003
12# Modified to maintain binary compatibility with previous library versions
13# by adding a suffix 'Q' ('quick') to the macro generated for the original,
14# function, and adding generation of a function (with the original name) which
15# instantiates the macro.
16#
17
18import sys
19import time
20
21#
22# A routine to take a list of yes/no (1, 0) values and turn it
23# into a list of ranges.  This will later be used to determine whether
24# to generate single-byte lookup tables, or inline comparisons
25#
26def makeRange(lst):
27    ret = []
28    pos = 0
29    while pos < len(lst):
30        try:            # index generates exception if not present
31            s = lst[pos:].index(1)      # look for start of next range
32        except:
33            break                       # if no more, finished
34        pos += s                        # pointer to start of possible range
35        try:
36            e = lst[pos:].index(0)      # look for end of range
37            e += pos
38        except:                         # if no end, set to end of list
39            e = len(lst)
40        ret.append((pos, e-1))          # append range tuple to list
41        pos = e + 1                     # ready to check for next range
42    return ret
43
44sources = "chvalid.def"                 # input filename
45
46# minTableSize gives the minimum number of ranges which must be present
47# before a 256-byte lookup table is produced.  If there are less than this
48# number, a macro with inline comparisons is generated
49minTableSize = 6
50
51# dictionary of functions, key=name, element contains char-map and range-list
52Functs = {}
53
54state = 0
55
56try:
57    defines = open("chvalid.def", "r")
58except:
59    print("Missing chvalid.def, aborting ...")
60    sys.exit(1)
61
62#
63# The lines in the .def file have three types:-
64#   name:   Defines a new function block
65#   ur:     Defines individual or ranges of unicode values
66#   end:    Indicates the end of the function block
67#
68# These lines are processed below.
69#
70for line in defines.readlines():
71    # ignore blank lines, or lines beginning with '#'
72    if line[0] == '#':
73        continue
74    line = line.strip()
75    if line == '':
76        continue
77    # split line into space-separated fields, then split on type
78    try:
79        fields = line.split(' ')
80        #
81        # name line:
82        #   validate any previous function block already ended
83        #   validate this function not already defined
84        #   initialize an entry in the function dicitonary
85        #       including a mask table with no values yet defined
86        #
87        if fields[0] == 'name':
88            name = fields[1]
89            if state != 0:
90                print("'name' %s found before previous name" \
91                      "completed" % (fields[1]))
92                continue
93            state = 1
94            if name in Functs:
95                print("name '%s' already present - may give" \
96                      " wrong results" % (name))
97            else:
98                # dict entry with two list elements (chdata, rangedata)
99                Functs[name] = [ [], [] ]
100                for v in range(256):
101                    Functs[name][0].append(0)
102        #
103        # end line:
104        #   validate there was a preceding function name line
105        #   set state to show no current function active
106        #
107        elif fields[0] == 'end':
108            if state == 0:
109                print("'end' found outside of function block")
110                continue
111            state = 0
112
113        #
114        # ur line:
115        #   validate function has been defined
116        #   process remaining fields on the line, which may be either
117        #       individual unicode values or ranges of values
118        #
119        elif fields[0] == 'ur':
120            if state != 1:
121                raise Exception("'ur' found outside of 'name' block")
122            for el in fields[1:]:
123                pos = el.find('..')
124                # pos <=0 means not a range, so must be individual value
125                if pos <= 0:
126                    # cheap handling of hex or decimal values
127                    if el[0:2] == '0x':
128                        value = int(el[2:],16)
129                    elif el[0] == "'":
130                        value = ord(el[1])
131                    else:
132                        value = int(el)
133                    if ((value < 0) | (value > 0x1fffff)):
134                        raise Exception('Illegal value (%s) in ch for'\
135                                ' name %s' % (el,name))
136                    # for ur we have only ranges (makes things simpler),
137                    # so convert val to range
138                    currange = (value, value)
139                # pos > 0 means this is a range, so isolate/validate
140                # the interval
141                else:
142                    # split the range into it's first-val, last-val
143                    (first, last) = el.split("..")
144                    # convert values from text into binary
145                    if first[0:2] == '0x':
146                        start = int(first[2:],16)
147                    elif first[0] == "'":
148                        start = ord(first[1])
149                    else:
150                        start = int(first)
151                    if last[0:2] == '0x':
152                        end = int(last[2:],16)
153                    elif last[0] == "'":
154                        end = ord(last[1])
155                    else:
156                        end = int(last)
157                    if (start < 0) | (end > 0x1fffff) | (start > end):
158                        raise Exception("Invalid range '%s'" % el)
159                    currange = (start, end)
160                # common path - 'currange' has the range, now take care of it
161                # We split on single-byte values vs. multibyte
162                if currange[1] < 0x100: # single-byte
163                    for ch in range(currange[0],currange[1]+1):
164                        # validate that value not previously defined
165                        if Functs[name][0][ch]:
166                            msg = "Duplicate ch value '%s' for name '%s'" % (el, name)
167                            raise Exception(msg)
168                        Functs[name][0][ch] = 1
169                else:                   # multi-byte
170                    if currange in Functs[name][1]:
171                        raise Exception("range already defined in" \
172                                " function")
173                    else:
174                        Functs[name][1].append(currange)
175
176    except:
177        print("Failed to process line: %s" % (line))
178        raise
179#
180# At this point, the entire definition file has been processed.  Now we
181# enter the output phase, where we generate the two files chvalid.c and'
182# chvalid.h
183#
184# To do this, we first output the 'static' data (heading, fixed
185# definitions, etc.), then output the 'dynamic' data (the results
186# of the above processing), and finally output closing 'static' data
187# (e.g. the subroutine to process the ranges)
188#
189
190#
191# Generate the headings:
192#
193try:
194    header = open("include/libxml/chvalid.h", "w")
195except:
196    print("Failed to open include/libxml/chvalid.h")
197    sys.exit(1)
198
199try:
200    output = open("chvalid.c", "w")
201except:
202    print("Failed to open chvalid.c")
203    sys.exit(1)
204
205date = time.asctime(time.localtime(time.time()))
206
207header.write(
208"""/*
209 * Summary: Unicode character range checking
210 * Description: this module exports interfaces for the character
211 *               range validation APIs
212 *
213 * This file is automatically generated from the cvs source
214 * definition files using the genChRanges.py Python script
215 *
216 * Generation date: %s
217 * Sources: %s
218 * Author: William Brack <wbrack@mmm.com.hk>
219 */
220
221#ifndef __XML_CHVALID_H__
222#define __XML_CHVALID_H__
223
224#include <libxml/xmlversion.h>
225#include <libxml/xmlstring.h>
226
227#ifdef __cplusplus
228extern "C" {
229#endif
230
231/*
232 * Define our typedefs and structures
233 *
234 */
235typedef struct _xmlChSRange xmlChSRange;
236typedef xmlChSRange *xmlChSRangePtr;
237struct _xmlChSRange {
238    unsigned short\tlow;
239    unsigned short\thigh;
240};
241
242typedef struct _xmlChLRange xmlChLRange;
243typedef xmlChLRange *xmlChLRangePtr;
244struct _xmlChLRange {
245    unsigned int\tlow;
246    unsigned int\thigh;
247};
248
249typedef struct _xmlChRangeGroup xmlChRangeGroup;
250typedef xmlChRangeGroup *xmlChRangeGroupPtr;
251struct _xmlChRangeGroup {
252    int\t\t\tnbShortRange;
253    int\t\t\tnbLongRange;
254    const xmlChSRange\t*shortRange;\t/* points to an array of ranges */
255    const xmlChLRange\t*longRange;
256};
257
258/**
259 * Range checking routine
260 */
261XMLPUBFUN int
262\t\txmlCharInRange(unsigned int val, const xmlChRangeGroup *group);
263
264""" % (date, sources));
265output.write(
266"""/*
267 * chvalid.c:\tthis module implements the character range
268 *\t\tvalidation APIs
269 *
270 * This file is automatically generated from the cvs source
271 * definition files using the genChRanges.py Python script
272 *
273 * Generation date: %s
274 * Sources: %s
275 * William Brack <wbrack@mmm.com.hk>
276 */
277
278#define IN_LIBXML
279#include "libxml.h"
280#include <libxml/chvalid.h>
281
282#include <stddef.h>
283
284/*
285 * The initial tables ({func_name}_tab) are used to validate whether a
286 * single-byte character is within the specified group.  Each table
287 * contains 256 bytes, with each byte representing one of the 256
288 * possible characters.  If the table byte is set, the character is
289 * allowed.
290 *
291 */
292""" % (date, sources));
293
294#
295# Now output the generated data.
296# We try to produce the best execution times.  Tests have shown that validation
297# with direct table lookup is, when there are a "small" number of valid items,
298# still not as fast as a sequence of inline compares.  So, if the single-byte
299# portion of a range has a "small" number of ranges, we output a macro for inline
300# compares, otherwise we output a 256-byte table and a macro to use it.
301#
302
303fkeys = sorted(Functs.keys())
304
305for f in fkeys:
306
307# First we convert the specified single-byte values into a group of ranges.
308# If the total number of such ranges is less than minTableSize, we generate
309# an inline macro for direct comparisons; if greater, we generate a lookup
310# table.
311    if max(Functs[f][0]) > 0:   # only check if at least one entry
312        rangeTable = makeRange(Functs[f][0])
313        numRanges = len(rangeTable)
314        if numRanges >= minTableSize:   # table is worthwhile
315            header.write("XMLPUBVAR const unsigned char %s_tab[256];\n" % f)
316            header.write("""
317/**
318 * %s_ch:
319 * @c: char to validate
320 *
321 * Automatically generated by genChRanges.py
322 */
323""" % f)
324            header.write("#define %s_ch(c)\t(%s_tab[(c)])\n" % (f, f))
325
326            # write the constant data to the code file
327            output.write("const unsigned char %s_tab[256] = {\n" % f)
328            pline = "   "
329            for n in range(255):
330                pline += " 0x%02x," % Functs[f][0][n]
331                if len(pline) > 72:
332                    output.write(pline + "\n")
333                    pline = "   "
334            output.write(pline + " 0x%02x };\n\n" % Functs[f][0][255])
335
336        else:           # inline check is used
337            # first another little optimisation - if space is present,
338            # put it at the front of the list so it is checked first
339            try:
340                ix = rangeTable.remove((0x20, 0x20))
341                rangeTable.insert(0, (0x20, 0x20))
342            except:
343                pass
344            firstFlag = 1
345
346            header.write("""
347/**
348 * %s_ch:
349 * @c: char to validate
350 *
351 * Automatically generated by genChRanges.py
352 */
353""" % f)
354            # okay, I'm tired of the messy lineup - let's automate it!
355            pline = "#define %s_ch(c)" % f
356            # 'ntab' is number of tabs needed to position to col. 33 from name end
357            ntab = 4 - (len(pline)) // 8
358            if ntab < 0:
359                ntab = 0
360            just = ""
361            for i in range(ntab):
362                just += "\t"
363            pline = pline + just + "("
364            for rg in rangeTable:
365                if not firstFlag:
366                    pline += " || \\\n\t\t\t\t "
367                else:
368                    firstFlag = 0
369                if rg[0] == rg[1]:              # single value - check equal
370                    pline += "((c) == 0x%x)" % rg[0]
371                else:                           # value range
372                # since we are doing char, also change range ending in 0xff
373                    if rg[1] != 0xff:
374                        pline += "((0x%x <= (c)) &&" % rg[0]
375                        pline += " ((c) <= 0x%x))" % rg[1]
376                    else:
377                        pline += " (0x%x <= (c))" % rg[0]
378            pline += ")\n"
379            header.write(pline)
380
381    header.write("""
382/**
383 * %sQ:
384 * @c: char to validate
385 *
386 * Automatically generated by genChRanges.py
387 */
388""" % f)
389    pline = "#define %sQ(c)" % f
390    ntab = 4 - (len(pline)) // 8
391    if ntab < 0:
392        ntab = 0
393    just = ""
394    for i in range(ntab):
395        just += "\t"
396    header.write(pline + just + "(((c) < 0x100) ? \\\n\t\t\t\t ")
397    if max(Functs[f][0]) > 0:
398        header.write("%s_ch((c)) :" % f)
399    else:
400        header.write("0 :")
401
402    # if no ranges defined, value invalid if >= 0x100
403    numRanges = len(Functs[f][1])
404    if numRanges == 0:
405        header.write(" 0)\n\n")
406    else:
407        if numRanges >= minTableSize:
408            header.write(" \\\n\t\t\t\t xmlCharInRange((c), &%sGroup))\n\n"  % f)
409        else:           # if < minTableSize, generate inline code
410            firstFlag = 1
411            for rg in Functs[f][1]:
412                if not firstFlag:
413                    pline += " || \\\n\t\t\t\t "
414                else:
415                    firstFlag = 0
416                    pline = "\\\n\t\t\t\t("
417                if rg[0] == rg[1]:              # single value - check equal
418                    pline += "((c) == 0x%x)" % rg[0]
419                else:                           # value range
420                    pline += "((0x%x <= (c)) &&" % rg[0]
421                    pline += " ((c) <= 0x%x))" % rg[1]
422            pline += "))\n\n"
423            header.write(pline)
424
425
426    if len(Functs[f][1]) > 0:
427        header.write("XMLPUBVAR const xmlChRangeGroup %sGroup;\n" % f)
428
429
430#
431# Next we do the unicode ranges
432#
433
434for f in fkeys:
435    if len(Functs[f][1]) > 0:   # only generate if unicode ranges present
436        rangeTable = Functs[f][1]
437        rangeTable.sort()       # ascending tuple sequence
438        numShort = 0
439        numLong  = 0
440        for rg in rangeTable:
441            if rg[1] < 0x10000: # if short value
442                if numShort == 0:       # first occurrence
443                    pline = "static const xmlChSRange %s_srng[] = {" % f
444                else:
445                    pline += ","
446                numShort += 1
447                if len(pline) > 60:
448                    output.write(pline + "\n")
449                    pline = "    "
450                else:
451                    pline += " "
452                pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
453            else:               # if long value
454                if numLong == 0:        # first occurrence
455                    if numShort > 0:    # if there were shorts, finish them off
456                        output.write(pline + "};\n")
457                    pline = "static const xmlChLRange %s_lrng[] = { " % f
458                else:
459                    pline += ", "
460                numLong += 1
461                if len(pline) > 60:
462                    output.write(pline + "\n")
463                    pline = "    "
464                pline += "{0x%x, 0x%x}" % (rg[0], rg[1])
465        output.write(pline + "};\n")    # finish off last group
466
467        pline = "const xmlChRangeGroup %sGroup =\n\t{%d, %d, " % (f, numShort, numLong)
468        if numShort > 0:
469            pline += "%s_srng" % f
470        else:
471            pline += "(xmlChSRangePtr)0"
472        if numLong > 0:
473            pline += ", %s_lrng" % f
474        else:
475            pline += ", (xmlChLRangePtr)0"
476
477        output.write(pline + "};\n\n")
478
479output.write(
480"""
481/**
482 * xmlCharInRange:
483 * @val: character to be validated
484 * @rptr: pointer to range to be used to validate
485 *
486 * Does a binary search of the range table to determine if char
487 * is valid
488 *
489 * Returns: true if character valid, false otherwise
490 */
491int
492xmlCharInRange (unsigned int val, const xmlChRangeGroup *rptr) {
493    int low, high, mid;
494    const xmlChSRange *sptr;
495    const xmlChLRange *lptr;
496
497    if (rptr == NULL) return(0);
498    if (val < 0x10000) {\t/* is val in 'short' or 'long'  array? */
499\tif (rptr->nbShortRange == 0)
500\t    return 0;
501\tlow = 0;
502\thigh = rptr->nbShortRange - 1;
503\tsptr = rptr->shortRange;
504\twhile (low <= high) {
505\t    mid = (low + high) / 2;
506\t    if ((unsigned short) val < sptr[mid].low) {
507\t\thigh = mid - 1;
508\t    } else {
509\t\tif ((unsigned short) val > sptr[mid].high) {
510\t\t    low = mid + 1;
511\t\t} else {
512\t\t    return 1;
513\t\t}
514\t    }
515\t}
516    } else {
517\tif (rptr->nbLongRange == 0) {
518\t    return 0;
519\t}
520\tlow = 0;
521\thigh = rptr->nbLongRange - 1;
522\tlptr = rptr->longRange;
523\twhile (low <= high) {
524\t    mid = (low + high) / 2;
525\t    if (val < lptr[mid].low) {
526\t\thigh = mid - 1;
527\t    } else {
528\t\tif (val > lptr[mid].high) {
529\t\t    low = mid + 1;
530\t\t} else {
531\t\t    return 1;
532\t\t}
533\t    }
534\t}
535    }
536    return 0;
537}
538
539""");
540
541#
542# finally, generate the ABI compatibility functions
543#
544for f in fkeys:
545    output.write("""
546/**
547 * %s:
548 * @ch:  character to validate
549 *
550 * This function is DEPRECATED.
551""" % f);
552    if max(Functs[f][0]) > 0:
553        output.write(" * Use %s_ch or %sQ instead" % (f, f))
554    else:
555        output.write(" * Use %sQ instead" % f)
556    output.write("""
557 *
558 * Returns true if argument valid, false otherwise
559 */
560""")
561    output.write("int\n%s(unsigned int ch) {\n    return(%sQ(ch));\n}\n\n" % (f,f))
562    header.write("XMLPUBFUN int\n\t\t%s(unsigned int ch);\n" % f);
563#
564# Run complete - write trailers and close the output files
565#
566
567header.write("""
568#ifdef __cplusplus
569}
570#endif
571#endif /* __XML_CHVALID_H__ */
572""")
573
574header.close()
575
576output.close()
577
578