#
# genmap_schinese.py: Simplified Chinese Codecs Map Generator
#
# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
#
import os
import re

from genmap_support import *


# Inclusive (low, high) byte ranges for the first (C1) and second (C2) byte
# of each table region handed to DecodeMapWriter.update_decode_map().
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'

# Matches one assignment element of the GB18030 XML mapping: a 4-hex-digit
# code point (group 1) and its space-separated hex byte sequence (group 2).
re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse the GB18030 XML mapping read from the file-like object *fo*.

    Returns a ``(decmap, unilinear)`` pair:

    * ``decmap`` is a nested dict ``{first_byte: {second_byte: codepoint}}``
      holding every assignment whose native form is exactly two bytes.
    * ``unilinear`` is the sorted list of BMP code points (surrogates
      excluded) that have no one- or two-byte assignment in the file.

    Raises KeyError if a code point with a one- or two-byte assignment is a
    surrogate or appears more than once.
    """
    decmap = {}
    # Start from every non-surrogate BMP code point and strike out the ones
    # that have an explicit one- or two-byte assignment.
    unmapped = {
        cp for cp in range(65536)
        if cp < 0xd800 or cp > 0xdfff  # exclude unicode surrogate area
    }
    for uni_hex, native_hex in re_gb18030ass.findall(fo.read()):
        # The regex only admits hex digits, so int(..., 16) parses exactly
        # what was matched without evaluating file content as code.
        uni = int(uni_hex, 16)
        native = [int(byte_hex, 16) for byte_hex in native_hex.split()]
        if len(native) <= 2:
            unmapped.remove(uni)
        if len(native) == 2:  # we can decode algorithmically for 1 or 4 bytes
            decmap.setdefault(native[0], {})[native[1]] = uni
    return decmap, sorted(unmapped)


def _add_encode_entries(encmap, decmap):
    """Invert the two-level decode map *decmap* into *encmap* in place.

    Each ``decmap[c1][c2] == code`` becomes
    ``encmap[code >> 8][code & 0xff] == c1 << 8 | c2``; an existing entry for
    the same code is overwritten.
    """
    for c1, row in decmap.items():
        for c2, code in row.items():
            encmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2


def main():
    """Load the mapping files and write the generated tables to mappings_cn.h.

    The output file is created in the current working directory.
    """
    print("Loading Mapping File...")
    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)

    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
    # NOTE(review): loadmap() is used below as returning a
    # {c1: {c2: codepoint}} nested dict -- confirm in genmap_support.
    gbkdecmap = loadmap(cp936map)
    gb2312decmap = loadmap(gb2312map)

    # Remove from the GB18030 extension table everything GBK (CP936) already
    # defines, dropping rows that become empty.
    for c1, row in gbkdecmap.items():
        for c2 in row:
            del gb18030decmap[c1][c2]
            if not gb18030decmap[c1]:
                del gb18030decmap[c1]

    # Remove from the GBK table every entry that is the GB2312 entry with the
    # high bit set on both bytes and the same code point.
    for c1, row in gb2312decmap.items():
        for c2, code in row.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    gb2312_gbkencmap, gb18030encmap = {}, {}
    # Order matters: GB2312 entries are added last so they win on a clash.
    _add_encode_entries(gb2312_gbkencmap, gbkdecmap)     # MSB set
    _add_encode_entries(gb2312_gbkencmap, gb2312decmap)  # MSB unset
    _add_encode_entries(gb18030encmap, gb18030decmap)

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        # The five GB18030 extension regions, listed explicitly instead of
        # being looked up by name through eval().
        gb18030ext_regions = (
            (GB18030EXTP1_C1, GB18030EXTP1_C2),
            (GB18030EXTP2_C1, GB18030EXTP2_C2),
            (GB18030EXTP3_C1, GB18030EXTP3_C2),
            (GB18030EXTP4_C1, GB18030EXTP4_C2),
            (GB18030EXTP5_C1, GB18030EXTP5_C2),
        )
        for c1_range, c2_range in gb18030ext_regions:
            writer.update_decode_map(c1_range, c2_range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        # Each entry is [first, last, base]: a run of consecutive code points
        # and the running index of its first member. The [-1, -1, -1]
        # sentinel lets the loop always compare against ranges[-1].
        ranges = [[-1, -1, -1]]
        gblinnum = 0
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4 first, last;
    DBCHAR base;
} gb18030_to_unibmp_ranges[] = {
""")

        for uni in gb18030unilinear:
            if uni == ranges[-1][1] + 1:
                ranges[-1][1] = uni
            else:
                ranges.append([uni, uni, gblinnum])
            gblinnum += 1

        filler = BufferedFiller()
        for first, last, base in ranges[1:]:
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminating entry: first = last = 0, base = one past the last
        # index covered by the final range.
        filler.write('{', '0,', '0,', str(
            ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
        filler.printout(fp)

    print("Done!")


if __name__ == '__main__':
    main()