#!/usr/bin/env python3
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
import string
import sys
import time

# Location of the Unicode Character Database release this script was
# written against, and the names of the two data files it reads
# (the "blocks" file first, the per-character data file second).
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it maps block names which were used in
# the 3.1 release, but are missing or changed in the current release.
# The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
#
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# A category gets a range table only when it has MORE than minTableSize
# ranges; with minTableSize ranges or fewer, inline comparisons are
# generated instead.
minTableSize = 8

blockfile, catfile = sources.split()

#
# Process the "blocks" file, reducing it to a dictionary indexed by
# block name (with spaces removed).  Each entry is a list of
# (start, end) tuples; both bounds are kept as "0x"-prefixed hex
# strings because they are pasted verbatim into the generated C code.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except OSError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

# The context manager closes the file even if parsing raises.
with blocks:
    for line in blocks:
        # Comment lines are recognised before stripping, so only a '#'
        # in the very first column counts.
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        fields = line.split(';')
        try:
            # ValueError: the range is not exactly "START..END".
            # IndexError: there is no name field after the ';'.
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
print("Parsed %d blocks descriptions" % (len(BlockNames.keys())))

# Register every legacy alias: the old block name receives the ranges of
# each current block it maps to.  Components that are unknown to the
# blocks file are reported and skipped.
for block in blockAliases:
    alias = block.split(':')
    old_name = alias[0]
    for comp in alias[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, comp))
            continue
        if old_name not in BlockNames:
            BlockNames[old_name] = []
        merged = BlockNames[old_name]
        for r in BlockNames[comp]:
            merged.append(r)

#
# Next process the Categories file. This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the code points of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except OSError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
# The context manager closes the categories file itself; the previous
# code closed the (already closed) blocks file here and leaked this one.
with data:
    for line in data:
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        fields = line.split(';')
        try:
            # int(..., 16) rejects malformed code points instead of
            # silently turning stray characters into zero digits.
            value = int(fields[0].strip(), 16)
            name = fields[2]
            # An empty category would produce a generated function called
            # "xmlUCSIsCat", clashing with the real dispatcher.
            if name == '':
                raise ValueError("empty category name")
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # Record the code point under the full name and under the
        # one-letter general category.
        Categories.setdefault(name, []).append(value)
        Categories.setdefault(name[0], []).append(value)

print("Parsed %d char generating %d categories" % (nbchar, len(Categories.keys())))

#
# The data is now all read.  Time to process it into a more useful form.
#
def _collapse_to_ranges(values):
    """Collapse a sequence of code points into inclusive (low, high) tuples.

    A run is extended only while each value is exactly one more than the
    previous one; any other value (a gap, a repeat, or a smaller value)
    starts a new run.  The input order is kept as is, so the caller is
    expected to pass the values in ascending order.  An empty input
    yields an empty list.
    """
    ranges = []
    start = prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            # A lone code point simply ends up as (start, start).
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


# reduce the number list of every category into ranges
for cat in Categories:
    Categories[cat] = _collapse_to_ranges(Categories[cat])

#
# The generated name tables are searched with a binary search, so the
# block and category names must be emitted in sorted order.
#
bkeys = sorted(BlockNames)
ckeys = sorted(Categories)

#
# Generate the resulting files.  Both paths are relative to the current
# working directory.
#
try:
    header = open("include/libxml/xmlunicode.h", "w")
except OSError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except OSError:
    print("Failed to open xmlunicode.c")
    # Do not leave the header handle dangling on the way out.
    header.close()
    sys.exit(1)

# Local time stamp embedded in both generated files; asctime() without
# an argument formats the current local time.
date = time.asctime()

# Values substituted, in this order, into the banner comment of both
# generated files.
generation_info = (webpage, date, sources)

# Opening part of include/libxml/xmlunicode.h: banner, include guard and
# the start of the extern "C" section.
header_prologue = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""

# Opening part of xmlunicode.c: banner, includes, the table types and the
# first line of the block name table.  Tabs in the emitted C text are
# spelled as \\t escapes so editors cannot silently expand them.
source_prologue = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int);\t/* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    const xmlUnicodeRange *table;
    int\t\t    numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname);

static const xmlUnicodeRange xmlUnicodeBlocks[] = {
"""

header.write(header_prologue % generation_info)
output.write(source_prologue % generation_info)

# Body of xmlUnicodeBlocks[]: one {"name", function} entry per block.
# The C identifier drops any '-' from the block name; the string keeps it.
output.write(',\n'.join(
    '  {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
    for block in bkeys))
output.write('};\n\n')

# xmlUnicodeCats[]: the same layout for the category names.
output.write('static const xmlUnicodeRange xmlUnicodeCats[] = {\n')
output.write(',\n'.join(
    '  {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys))
output.write('};\n\n')

#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange: an xmlChSRange array for
# ranges whose upper bound is below 0x10000, an xmlChLRange array for the
# others, and an xmlChRangeGroup tying the two together.
#
for name in ckeys:
    cat_ranges = Categories[name]
    if len(cat_ranges) <= minTableSize:
        continue
    short_count = 0
    long_count = 0
    short_ref = "NULL"
    long_ref = "NULL"
    for (low, high) in cat_ranges:
        if high < 0x10000:
            if short_count:
                pline += ","
            else:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                short_ref = "xml%sS" % name
            short_count += 1
        else:
            if long_count:
                pline += ","
            else:
                # First long range: terminate the short array, if any,
                # before opening the long one.
                if short_count:
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                long_ref = "xml%sL" % name
            long_count += 1
        # Wrap the generated C line once it grows past 60 characters.
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = "    "
        elif pline.endswith(","):
            pline += " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    output.write(pline + " };\n")
    output.write("static const xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, short_count, long_count, short_ref, long_ref))

# The two name-table descriptors (with their entry counts) followed by
# the binary-search helper shared by xmlUCSIsBlock and xmlUCSIsCat.  The
# generated doc comment names the parameter @tname, matching the C
# signature (it used to say @name).  Tabs in the emitted C text are
# spelled as \\t escapes.
lookup_source = """static const xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static const xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @tname: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(const xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    const xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
\tmid = (low + high) / 2;
\tif ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
\t    return (sptr[mid].func);
\tif (cmp < 0)
\t    high = mid - 1;
\telse
\t    low = mid + 1;
    }
    return (NULL);
}

"""

output.write(lookup_source % (len(BlockNames), len(Categories)))

# One prototype (header) and one predicate function (source) per block.
# The predicate is the "or" of an inclusive range test for every range
# recorded for that block.
for block in bkeys:
    name = block.replace('-', '')
    header.write("XMLPUBFUN int xmlUCSIs%s\t(int code);\n" % name)
    output.write(
        "/**\n * xmlUCSIs%s:\n * @code: UCS code point\n"
        " *\n * Check whether the character is part of %s UCS Block\n"
        " *\n * Returns 1 if true 0 otherwise\n */\n"
        "int\nxmlUCSIs%s(int code) {\n    return(" % (name, block, name))
    tests = ["((code >= %s) && (code <= %s))" % (low, high)
             for (low, high) in BlockNames[block]]
    output.write(" ||\n           ".join(tests))
    output.write(");\n}\n\n")

# Prototype and implementation of the by-name block test, which looks
# the block name up in xmlUnicodeBlockTbl and calls the matching
# predicate.  The tab in the emitted C text is spelled as a \\t escape.
is_block_prototype = "\nXMLPUBFUN int xmlUCSIsBlock\t(int code, const char *block);\n\n"
is_block_source = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
\treturn (-1);
    return (func(code));
}

"""

header.write(is_block_prototype)
output.write(is_block_source)

# One prototype and one predicate function per category.  Categories
# with more than minTableSize ranges defer to xmlCharInRange on their
# range group; the others get inline comparisons.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int xmlUCSIsCat%s\t(int code);\n" % name)
    output.write(
        "/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n"
        " *\n * Check whether the character is part of %s UCS Category\n"
        " *\n * Returns 1 if true 0 otherwise\n */\n"
        "int\nxmlUCSIsCat%s(int code) {\n" % (name, name, name))
    if len(ranges) > minTableSize:
        output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin == end:
                # A single code point needs only an equality test.
                tests.append("(code == %s)" % (hex(begin)))
            else:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        if tests:
            output.write("    return(" + " ||\n           ".join(tests))
    output.write(");\n}\n\n")

# Prototype and implementation of the by-name category test, followed by
# the #endif closing the LIBXML_UNICODE_ENABLED section of the C file.
# The tab in the emitted C text is spelled as a \\t escape.
is_cat_prototype = "\nXMLPUBFUN int xmlUCSIsCat\t(int code, const char *cat);\n"
is_cat_source = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
\treturn (-1);
    return (func(code));
}

#endif /* LIBXML_UNICODE_ENABLED */
"""

header.write(is_cat_prototype)
output.write(is_cat_source)

# Close the extern "C" section, the feature guard and the include guard
# of the header, then release both output files.
header.write(
    "\n"
    "#ifdef __cplusplus\n"
    "}\n"
    "#endif\n"
    "\n"
    "#endif /* LIBXML_UNICODE_ENABLED */\n"
    "\n"
    "#endif /* __XML_UNICODE_H__ */\n")

header.close()
output.close()