#! /usr/bin/python

# Generate utt tables. Note: this script has now been converted to Python 3.

# The source file pcre2_tables.c contains (amongst other things), a table that
# is indexed by script name. In order to reduce the number of relocations when
# loading the library, the names are held as a single large string, with
# offsets in the table. This is tedious to maintain by hand. Therefore, this
# script is used to generate the table. The output is sent to stdout; usually
# that should be directed to a temporary file. Then pcre2_tables.c can be
# edited by replacing the relevant definitions and table therein with the
# temporary file.

# Modified by PH 17-March-2009 to generate the more verbose form that works
# for UTF-support in EBCDIC as well as ASCII environments.
# Modified by PH 01-March-2010 to add new scripts for Unicode 5.2.0.
# Modified by PH 04-May-2010 to add new "X.." special categories.
# Modified by PH 30-April-2011 to add new scripts for Unicode 6.0.0
# Modified by ChPe 30-September-2012 to add this note; no other changes were
# necessary for Unicode 6.2.0 support.
# Modified by PH 26-February-2013 to add the Xuc special category.
# Comment modified by PH 13-May-2014 to update to PCRE2 file names.
# Script updated to Python 3 by running it through the 2to3 converter.
# Added script names for Unicode 7.0.0, 20-June-2014.
# Added script names for Unicode 8.0.0, 19-June-2015.
# Added script names for Unicode 10.0.0, 02-July-2017.
# Added script names for Unicode 11.0.0, 03-July-2018.
# Added 'Unknown' script, 01-October-2018.
# Added script names for Unicode 12.1.0, 27-July-2019.
# Added script names for Unicode 13.0.0, 10-March-2020.
# Added Script names for Unicode 14.0.0, PCRE2-10.39

# Unicode script names, grouped by the Unicode version that introduced them.
# The order here does not affect the output: utt_table is sorted before
# anything is printed.
script_names = [
    'Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille',
    'Buginese', 'Buhid', 'Canadian_Aboriginal',
    'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret',
    'Devanagari', 'Ethiopic', 'Georgian',
    'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul',
    'Hanunoo', 'Hebrew', 'Hiragana',
    'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin',
    'Limbu', 'Linear_B', 'Malayalam',
    'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic',
    'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
    'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa',
    'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
    'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
    # New for Unicode 5.0
    'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
    # New for Unicode 5.1
    'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki',
    'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
    # New for Unicode 5.2
    'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
    'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
    'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
    'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
    # New for Unicode 6.0.0
    'Batak', 'Brahmi', 'Mandaic',
    # New for Unicode 6.1.0
    'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada',
    'Sora_Sompeng', 'Takri',
    # New for Unicode 7.0.0
    'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha',
    'Khojki', 'Khudawadi',
    'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro',
    'Nabataean',
    'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene',
    'Psalter_Pahlavi',
    'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
    # New for Unicode 8.0.0
    'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
    'SignWriting',
    # New for Unicode 10.0.0
    'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut',
    'Masaram_Gondi',
    'Nushu', 'Soyombo', 'Zanabazar_Square',
    # New for Unicode 11.0.0
    'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
    'Old_Sogdian', 'Sogdian',
    # New for Unicode 12.0.0
    'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
    # New for Unicode 13.0.0
    'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
    # New for Unicode 14.0.0
    'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi',
]

category_names = [
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
]

general_category_names = ['C', 'L', 'M', 'N', 'P', 'S', 'Z']

# Our own specials, as (name, property type) pairs. This list is the single
# source of truth: it feeds utt_table and also decides which entries get the
# value 0 instead of a ucp_ constant in the generated type table.
special_properties = [
    ('Any', 'PT_ANY'),
    ('L&', 'PT_LAMP'),
    ('Xan', 'PT_ALNUM'),
    ('Xps', 'PT_PXSPACE'),
    ('Xsp', 'PT_SPACE'),
    ('Xuc', 'PT_UCNC'),
    ('Xwd', 'PT_WORD'),
]
special_types = frozenset(prop_type for _, prop_type in special_properties)

# Characters that have a dedicated STR_ macro name rather than STR_<char>.
str_macro_overrides = {'_': 'STR_UNDERSCORE', '&': 'STR_AMPERSAND'}

# First add the Unicode script and category names, then our own specials.

utt_table = [(name, 'PT_SC') for name in script_names]
utt_table += [(name, 'PT_PC') for name in category_names]
utt_table += [(name, 'PT_GC') for name in general_category_names]
utt_table += special_properties

# Sort the table.

utt_table.sort()


def macro_name(name):
    """Return the STRING_<name>0 macro identifier for property *name*.

    Any '&' in the name is spelled out as '_AMPERSAND'.
    """
    return 'STRING_%s0' % name.replace('&', '_AMPERSAND')


def define_line(name):
    """Return the '#define' line that spells *name* out with STR_ macros.

    We have to use STR_ macros to define the strings so that it all works in
    UTF-8 mode on EBCDIC platforms. Each definition ends with a "\\0" literal.
    """
    parts = ['#define ' + macro_name(name)]
    parts.extend(str_macro_overrides.get(c, 'STR_%s' % c) for c in name)
    parts.append('"\\0"')
    return ' '.join(parts)


def names_table_lines(table):
    """Return the body lines of the utt_names string definition.

    One line per entry of *table* (a sequence of (name, type) pairs), each
    naming that entry's STRING_ macro; the final line carries the closing ';'.
    """
    lines = [' %s' % macro_name(name) for name, _ in table]
    if lines:
        lines[-1] += ';'
    return lines


def type_table_lines(table):
    """Return the initializer lines of the utt type table.

    Each line holds the offset of the entry's name within the concatenated
    names string, its property type, and its value: 0 for our own specials,
    otherwise 'ucp_' followed by the name. Every line except the last ends
    with a comma.
    """
    lines = []
    offset = 0
    for name, prop_type in table:
        value = '0' if prop_type in special_types else 'ucp_' + name
        lines.append(' { %3d, %s, %s },' % (offset, prop_type, value))
        # Each name is followed by a "\0" terminator in the names string.
        offset += len(name) + 1
    if lines:
        # No separator after the final initializer.
        lines[-1] = lines[-1][:-1]
    return lines


# Emit one #define per name.

for name, _ in utt_table:
    print(define_line(name))

# Print the actual table, using the string names

print('')
print('const char PRIV(utt_names)[] =')
for line in names_table_lines(utt_table):
    print(line)

print('\nconst ucp_type_table PRIV(utt)[] = {')
for line in type_table_lines(utt_table):
    print(line)
print('};')