1#!/usr/bin/env python3 2# -*- coding: utf-8 -*- 3 4# This script reads Huffman Code table [1] and generates symbol table 5# and decoding tables in C language. The resulting code is used in 6# lib/nghttp2_hd_huffman.h and lib/nghttp2_hd_huffman_data.c 7# 8# [1] https://httpwg.org/specs/rfc7541.html 9 10import re 11import sys 12from io import StringIO 13 14# From [1] 15HUFFMAN_CODE_TABLE = """\ 16 ( 0) |11111111|11000 1ff8 [13] 17 ( 1) |11111111|11111111|1011000 7fffd8 [23] 18 ( 2) |11111111|11111111|11111110|0010 fffffe2 [28] 19 ( 3) |11111111|11111111|11111110|0011 fffffe3 [28] 20 ( 4) |11111111|11111111|11111110|0100 fffffe4 [28] 21 ( 5) |11111111|11111111|11111110|0101 fffffe5 [28] 22 ( 6) |11111111|11111111|11111110|0110 fffffe6 [28] 23 ( 7) |11111111|11111111|11111110|0111 fffffe7 [28] 24 ( 8) |11111111|11111111|11111110|1000 fffffe8 [28] 25 ( 9) |11111111|11111111|11101010 ffffea [24] 26 ( 10) |11111111|11111111|11111111|111100 3ffffffc [30] 27 ( 11) |11111111|11111111|11111110|1001 fffffe9 [28] 28 ( 12) |11111111|11111111|11111110|1010 fffffea [28] 29 ( 13) |11111111|11111111|11111111|111101 3ffffffd [30] 30 ( 14) |11111111|11111111|11111110|1011 fffffeb [28] 31 ( 15) |11111111|11111111|11111110|1100 fffffec [28] 32 ( 16) |11111111|11111111|11111110|1101 fffffed [28] 33 ( 17) |11111111|11111111|11111110|1110 fffffee [28] 34 ( 18) |11111111|11111111|11111110|1111 fffffef [28] 35 ( 19) |11111111|11111111|11111111|0000 ffffff0 [28] 36 ( 20) |11111111|11111111|11111111|0001 ffffff1 [28] 37 ( 21) |11111111|11111111|11111111|0010 ffffff2 [28] 38 ( 22) |11111111|11111111|11111111|111110 3ffffffe [30] 39 ( 23) |11111111|11111111|11111111|0011 ffffff3 [28] 40 ( 24) |11111111|11111111|11111111|0100 ffffff4 [28] 41 ( 25) |11111111|11111111|11111111|0101 ffffff5 [28] 42 ( 26) |11111111|11111111|11111111|0110 ffffff6 [28] 43 ( 27) |11111111|11111111|11111111|0111 ffffff7 [28] 44 ( 28) |11111111|11111111|11111111|1000 ffffff8 [28] 45 ( 29) |11111111|11111111|11111111|1001 ffffff9 [28] 46 ( 30) |11111111|11111111|11111111|1010 ffffffa [28] 47 ( 31) |11111111|11111111|11111111|1011 ffffffb [28] 48' ' ( 32) |010100 14 [ 6] 49'!' ( 33) |11111110|00 3f8 [10] 50'"' ( 34) |11111110|01 3f9 [10] 51'#' ( 35) |11111111|1010 ffa [12] 52'$' ( 36) |11111111|11001 1ff9 [13] 53'%' ( 37) |010101 15 [ 6] 54'&' ( 38) |11111000 f8 [ 8] 55''' ( 39) |11111111|010 7fa [11] 56'(' ( 40) |11111110|10 3fa [10] 57')' ( 41) |11111110|11 3fb [10] 58'*' ( 42) |11111001 f9 [ 8] 59'+' ( 43) |11111111|011 7fb [11] 60',' ( 44) |11111010 fa [ 8] 61'-' ( 45) |010110 16 [ 6] 62'.' ( 46) |010111 17 [ 6] 63'/' ( 47) |011000 18 [ 6] 64'0' ( 48) |00000 0 [ 5] 65'1' ( 49) |00001 1 [ 5] 66'2' ( 50) |00010 2 [ 5] 67'3' ( 51) |011001 19 [ 6] 68'4' ( 52) |011010 1a [ 6] 69'5' ( 53) |011011 1b [ 6] 70'6' ( 54) |011100 1c [ 6] 71'7' ( 55) |011101 1d [ 6] 72'8' ( 56) |011110 1e [ 6] 73'9' ( 57) |011111 1f [ 6] 74':' ( 58) |1011100 5c [ 7] 75';' ( 59) |11111011 fb [ 8] 76'<' ( 60) |11111111|1111100 7ffc [15] 77'=' ( 61) |100000 20 [ 6] 78'>' ( 62) |11111111|1011 ffb [12] 79'?' ( 63) |11111111|00 3fc [10] 80'@' ( 64) |11111111|11010 1ffa [13] 81'A' ( 65) |100001 21 [ 6] 82'B' ( 66) |1011101 5d [ 7] 83'C' ( 67) |1011110 5e [ 7] 84'D' ( 68) |1011111 5f [ 7] 85'E' ( 69) |1100000 60 [ 7] 86'F' ( 70) |1100001 61 [ 7] 87'G' ( 71) |1100010 62 [ 7] 88'H' ( 72) |1100011 63 [ 7] 89'I' ( 73) |1100100 64 [ 7] 90'J' ( 74) |1100101 65 [ 7] 91'K' ( 75) |1100110 66 [ 7] 92'L' ( 76) |1100111 67 [ 7] 93'M' ( 77) |1101000 68 [ 7] 94'N' ( 78) |1101001 69 [ 7] 95'O' ( 79) |1101010 6a [ 7] 96'P' ( 80) |1101011 6b [ 7] 97'Q' ( 81) |1101100 6c [ 7] 98'R' ( 82) |1101101 6d [ 7] 99'S' ( 83) |1101110 6e [ 7] 100'T' ( 84) |1101111 6f [ 7] 101'U' ( 85) |1110000 70 [ 7] 102'V' ( 86) |1110001 71 [ 7] 103'W' ( 87) |1110010 72 [ 7] 104'X' ( 88) |11111100 fc [ 8] 105'Y' ( 89) |1110011 73 [ 7] 106'Z' ( 90) |11111101 fd [ 8] 107'[' ( 91) |11111111|11011 1ffb [13] 108'\' ( 92) |11111111|11111110|000 7fff0 [19] 109']' ( 93) |11111111|11100 1ffc [13] 110'^' ( 94) |11111111|111100 3ffc [14] 111'_' ( 95) |100010 22 [ 6] 112'`' ( 96) |11111111|1111101 7ffd [15] 113'a' ( 97) |00011 3 [ 5] 114'b' ( 98) |100011 23 [ 6] 115'c' ( 99) |00100 4 [ 5] 116'd' (100) |100100 24 [ 6] 117'e' (101) |00101 5 [ 5] 118'f' (102) |100101 25 [ 6] 119'g' (103) |100110 26 [ 6] 120'h' (104) |100111 27 [ 6] 121'i' (105) |00110 6 [ 5] 122'j' (106) |1110100 74 [ 7] 123'k' (107) |1110101 75 [ 7] 124'l' (108) |101000 28 [ 6] 125'm' (109) |101001 29 [ 6] 126'n' (110) |101010 2a [ 6] 127'o' (111) |00111 7 [ 5] 128'p' (112) |101011 2b [ 6] 129'q' (113) |1110110 76 [ 7] 130'r' (114) |101100 2c [ 6] 131's' (115) |01000 8 [ 5] 132't' (116) |01001 9 [ 5] 133'u' (117) |101101 2d [ 6] 134'v' (118) |1110111 77 [ 7] 135'w' (119) |1111000 78 [ 7] 136'x' (120) |1111001 79 [ 7] 137'y' (121) |1111010 7a [ 7] 138'z' (122) |1111011 7b [ 7] 139'{' (123) |11111111|1111110 7ffe [15] 140'|' (124) |11111111|100 7fc [11] 141'}' (125) |11111111|111101 3ffd [14] 142'~' (126) |11111111|11101 1ffd [13] 143 (127) |11111111|11111111|11111111|1100 ffffffc [28] 144 (128) |11111111|11111110|0110 fffe6 [20] 145 (129) |11111111|11111111|010010 3fffd2 [22] 146 (130) |11111111|11111110|0111 fffe7 [20] 147 (131) |11111111|11111110|1000 fffe8 [20] 148 (132) |11111111|11111111|010011 3fffd3 [22] 149 (133) |11111111|11111111|010100 3fffd4 [22] 150 (134) |11111111|11111111|010101 3fffd5 [22] 151 (135) |11111111|11111111|1011001 7fffd9 [23] 152 (136) |11111111|11111111|010110 3fffd6 [22] 153 (137) |11111111|11111111|1011010 7fffda [23] 154 (138) |11111111|11111111|1011011 7fffdb [23] 155 (139) |11111111|11111111|1011100 7fffdc [23] 156 (140) |11111111|11111111|1011101 7fffdd [23] 157 (141) |11111111|11111111|1011110 7fffde [23] 158 (142) |11111111|11111111|11101011 ffffeb [24] 159 (143) |11111111|11111111|1011111 7fffdf [23] 160 (144) |11111111|11111111|11101100 ffffec [24] 161 (145) |11111111|11111111|11101101 ffffed [24] 162 (146) |11111111|11111111|010111 3fffd7 [22] 163 (147) |11111111|11111111|1100000 7fffe0 [23] 164 (148) |11111111|11111111|11101110 ffffee [24] 165 (149) |11111111|11111111|1100001 7fffe1 [23] 166 (150) |11111111|11111111|1100010 7fffe2 [23] 167 (151) |11111111|11111111|1100011 7fffe3 [23] 168 (152) |11111111|11111111|1100100 7fffe4 [23] 169 (153) |11111111|11111110|11100 1fffdc [21] 170 (154) |11111111|11111111|011000 3fffd8 [22] 171 (155) |11111111|11111111|1100101 7fffe5 [23] 172 (156) |11111111|11111111|011001 3fffd9 [22] 173 (157) |11111111|11111111|1100110 7fffe6 [23] 174 (158) |11111111|11111111|1100111 7fffe7 [23] 175 (159) |11111111|11111111|11101111 ffffef [24] 176 (160) |11111111|11111111|011010 3fffda [22] 177 (161) |11111111|11111110|11101 1fffdd [21] 178 (162) |11111111|11111110|1001 fffe9 [20] 179 (163) |11111111|11111111|011011 3fffdb [22] 180 (164) |11111111|11111111|011100 3fffdc [22] 181 (165) |11111111|11111111|1101000 7fffe8 [23] 182 (166) |11111111|11111111|1101001 7fffe9 [23] 183 (167) |11111111|11111110|11110 1fffde [21] 184 (168) |11111111|11111111|1101010 7fffea [23] 185 (169) |11111111|11111111|011101 3fffdd [22] 186 (170) |11111111|11111111|011110 3fffde [22] 187 (171) |11111111|11111111|11110000 fffff0 [24] 188 (172) |11111111|11111110|11111 1fffdf [21] 189 (173) |11111111|11111111|011111 3fffdf [22] 190 (174) |11111111|11111111|1101011 7fffeb [23] 191 (175) |11111111|11111111|1101100 7fffec [23] 192 (176) |11111111|11111111|00000 1fffe0 [21] 193 (177) |11111111|11111111|00001 1fffe1 [21] 194 (178) |11111111|11111111|100000 3fffe0 [22] 195 (179) |11111111|11111111|00010 1fffe2 [21] 196 (180) |11111111|11111111|1101101 7fffed [23] 197 (181) |11111111|11111111|100001 3fffe1 [22] 198 (182) |11111111|11111111|1101110 7fffee [23] 199 (183) |11111111|11111111|1101111 7fffef [23] 200 (184) |11111111|11111110|1010 fffea [20] 201 (185) |11111111|11111111|100010 3fffe2 [22] 202 (186) |11111111|11111111|100011 3fffe3 [22] 203 (187) |11111111|11111111|100100 3fffe4 [22] 204 (188) |11111111|11111111|1110000 7ffff0 [23] 205 (189) |11111111|11111111|100101 3fffe5 [22] 206 (190) |11111111|11111111|100110 3fffe6 [22] 207 (191) |11111111|11111111|1110001 7ffff1 [23] 208 (192) |11111111|11111111|11111000|00 3ffffe0 [26] 209 (193) |11111111|11111111|11111000|01 3ffffe1 [26] 210 (194) |11111111|11111110|1011 fffeb [20] 211 (195) |11111111|11111110|001 7fff1 [19] 212 (196) |11111111|11111111|100111 3fffe7 [22] 213 (197) |11111111|11111111|1110010 7ffff2 [23] 214 (198) |11111111|11111111|101000 3fffe8 [22] 215 (199) |11111111|11111111|11110110|0 1ffffec [25] 216 (200) |11111111|11111111|11111000|10 3ffffe2 [26] 217 (201) |11111111|11111111|11111000|11 3ffffe3 [26] 218 (202) |11111111|11111111|11111001|00 3ffffe4 [26] 219 (203) |11111111|11111111|11111011|110 7ffffde [27] 220 (204) |11111111|11111111|11111011|111 7ffffdf [27] 221 (205) |11111111|11111111|11111001|01 3ffffe5 [26] 222 (206) |11111111|11111111|11110001 fffff1 [24] 223 (207) |11111111|11111111|11110110|1 1ffffed [25] 224 (208) |11111111|11111110|010 7fff2 [19] 225 (209) |11111111|11111111|00011 1fffe3 [21] 226 (210) |11111111|11111111|11111001|10 3ffffe6 [26] 227 (211) |11111111|11111111|11111100|000 7ffffe0 [27] 228 (212) |11111111|11111111|11111100|001 7ffffe1 [27] 229 (213) |11111111|11111111|11111001|11 3ffffe7 [26] 230 (214) |11111111|11111111|11111100|010 7ffffe2 [27] 231 (215) |11111111|11111111|11110010 fffff2 [24] 232 (216) |11111111|11111111|00100 1fffe4 [21] 233 (217) |11111111|11111111|00101 1fffe5 [21] 234 (218) |11111111|11111111|11111010|00 3ffffe8 [26] 235 (219) |11111111|11111111|11111010|01 3ffffe9 [26] 236 (220) |11111111|11111111|11111111|1101 ffffffd [28] 237 (221) |11111111|11111111|11111100|011 7ffffe3 [27] 238 (222) |11111111|11111111|11111100|100 7ffffe4 [27] 239 (223) |11111111|11111111|11111100|101 7ffffe5 [27] 240 (224) |11111111|11111110|1100 fffec [20] 241 (225) |11111111|11111111|11110011 fffff3 [24] 242 (226) |11111111|11111110|1101 fffed [20] 243 (227) |11111111|11111111|00110 1fffe6 [21] 244 (228) |11111111|11111111|101001 3fffe9 [22] 245 (229) |11111111|11111111|00111 1fffe7 [21] 246 (230) |11111111|11111111|01000 1fffe8 [21] 247 (231) |11111111|11111111|1110011 7ffff3 [23] 248 (232) |11111111|11111111|101010 3fffea [22] 249 (233) |11111111|11111111|101011 3fffeb [22] 250 (234) |11111111|11111111|11110111|0 1ffffee [25] 251 (235) |11111111|11111111|11110111|1 1ffffef [25] 252 (236) |11111111|11111111|11110100 fffff4 [24] 253 (237) |11111111|11111111|11110101 fffff5 [24] 254 (238) |11111111|11111111|11111010|10 3ffffea [26] 255 (239) |11111111|11111111|1110100 7ffff4 [23] 256 (240) |11111111|11111111|11111010|11 3ffffeb [26] 257 (241) |11111111|11111111|11111100|110 7ffffe6 [27] 258 (242) |11111111|11111111|11111011|00 3ffffec [26] 259 (243) |11111111|11111111|11111011|01 3ffffed [26] 260 (244) |11111111|11111111|11111100|111 7ffffe7 [27] 261 (245) |11111111|11111111|11111101|000 7ffffe8 [27] 262 (246) |11111111|11111111|11111101|001 7ffffe9 [27] 263 (247) |11111111|11111111|11111101|010 7ffffea [27] 264 (248) |11111111|11111111|11111101|011 7ffffeb [27] 265 (249) |11111111|11111111|11111111|1110 ffffffe [28] 266 (250) |11111111|11111111|11111101|100 7ffffec [27] 267 (251) |11111111|11111111|11111101|101 7ffffed [27] 268 (252) |11111111|11111111|11111101|110 7ffffee [27] 269 (253) |11111111|11111111|11111101|111 7ffffef [27] 270 (254) |11111111|11111111|11111110|000 7fffff0 [27] 271 (255) |11111111|11111111|11111011|10 3ffffee [26] 272EOS (256) |11111111|11111111|11111111|111111 3fffffff [30] 273""" 274 275class Node: 276 277 def __init__(self, term = None): 278 self.term = term 279 self.left = None 280 self.right = None 281 self.trans = [] 282 self.id = None 283 self.accept = False 284 285class Context: 286 287 def __init__(self): 288 self.next_id_ = 0 289 self.root = Node() 290 291 def next_id(self): 292 id = self.next_id_ 293 self.next_id_ += 1 294 return id 295 296def _add(node, sym, bits): 297 if len(bits) == 0: 298 node.term = sym 299 return 300 else: 301 if bits[0] == '0': 302 if node.left is None: 303 node.left = Node() 304 child = node.left 305 else: 306 if node.right is None: 307 node.right = Node() 308 child = node.right 309 _add(child, sym, bits[1:]) 310 311def huffman_tree_add(ctx, sym, bits): 312 _add(ctx.root, sym, bits) 313 314def _set_node_id(ctx, node, prefix): 315 if node.term is not None: 316 return 317 if len(prefix) <= 7 and [1] * len(prefix) == prefix: 318 node.accept = True 319 node.id = ctx.next_id() 320 _set_node_id(ctx, node.left, prefix + [0]) 321 _set_node_id(ctx, node.right, prefix + [1]) 322 323def huffman_tree_set_node_id(ctx): 324 _set_node_id(ctx, ctx.root, []) 325 326def _traverse(node, sym, start_node, root, left): 327 if left == 0: 328 if sym == 256: 329 sym = None 330 node = None 331 start_node.trans.append((node, sym)) 332 return 333 334 if node.term is not None: 335 node = root 336 337 def go(node): 338 if node.term is not None: 339 assert sym is None 340 nsym = node.term 341 else: 342 nsym = sym 343 344 _traverse(node, nsym, start_node, root, left - 1) 345 346 go(node.left) 347 go(node.right) 348 349def _build_transition_table(ctx, node): 350 if node is None: 351 return 352 _traverse(node, None, node, ctx.root, 4) 353 _build_transition_table(ctx, node.left) 354 _build_transition_table(ctx, node.right) 355 356def huffman_tree_build_transition_table(ctx): 357 _build_transition_table(ctx, ctx.root) 358 359NGHTTP2_HUFF_ACCEPTED = 1 << 14 360NGHTTP2_HUFF_SYM = 1 << 15 361 362def _print_transition_table(node): 363 if node.term is not None: 364 return 365 print('/* {} */'.format(node.id)) 366 print('{') 367 for nd, sym in node.trans: 368 flags = 0 369 if sym is None: 370 out = 0 371 else: 372 out = sym 373 flags |= NGHTTP2_HUFF_SYM 374 if nd is None: 375 id = 256 376 else: 377 id = nd.id 378 if id is None: 379 # if nd.id is None, it is a leaf node 380 id = 0 381 flags |= NGHTTP2_HUFF_ACCEPTED 382 elif nd.accept: 383 flags |= NGHTTP2_HUFF_ACCEPTED 384 print(' {{0x{:02x}, {}}},'.format(id | flags, out)) 385 print('},') 386 _print_transition_table(node.left) 387 _print_transition_table(node.right) 388 389def huffman_tree_print_transition_table(ctx): 390 _print_transition_table(ctx.root) 391 print('/* 256 */') 392 print('{') 393 print(' {0x100, 0},') 394 print(' {0x100, 0},') 395 print(' {0x100, 0},') 396 print(' {0x100, 0},') 397 print(' {0x100, 0},') 398 print(' {0x100, 0},') 399 print(' {0x100, 0},') 400 print(' {0x100, 0},') 401 print(' {0x100, 0},') 402 print(' {0x100, 0},') 403 print(' {0x100, 0},') 404 print(' {0x100, 0},') 405 print(' {0x100, 0},') 406 print(' {0x100, 0},') 407 print(' {0x100, 0},') 408 print(' {0x100, 0},') 409 print('},') 410 411if __name__ == '__main__': 412 ctx = Context() 413 symbol_tbl = [(None, 0) for i in range(257)] 414 415 for line in StringIO(HUFFMAN_CODE_TABLE): 416 m = re.match( 417 r'.*\(\s*(\d+)\)\s+([|01]+)\s+(\S+)\s+\[\s*(\d+)\].*', line) 418 if m: 419 sym = int(m.group(1)) 420 bits = re.sub(r'\|', '', m.group(2)) 421 code = m.group(3) 422 nbits = int(m.group(4)) 423 if len(code) > 8: 424 raise Error('Code is more than 4 bytes long') 425 assert(len(bits) == nbits) 426 symbol_tbl[sym] = (nbits, code) 427 huffman_tree_add(ctx, sym, bits) 428 429 huffman_tree_set_node_id(ctx) 430 huffman_tree_build_transition_table(ctx) 431 432 print('''\ 433typedef struct { 434 uint32_t nbits; 435 uint32_t code; 436} nghttp2_huff_sym; 437''') 438 439 print('''\ 440const nghttp2_huff_sym huff_sym_table[] = {''') 441 for i in range(257): 442 nbits = symbol_tbl[i][0] 443 k = int(symbol_tbl[i][1], 16) 444 k = k << (32 - nbits) 445 print('''\ 446 {{ {}, 0x{}u }}{}\ 447'''.format(symbol_tbl[i][0], hex(k)[2:], ',' if i < 256 else '')) 448 print('};') 449 print() 450 451 print('''\ 452enum {{ 453 NGHTTP2_HUFF_ACCEPTED = {}, 454 NGHTTP2_HUFF_SYM = {}, 455}} nghttp2_huff_decode_flag; 456'''.format(NGHTTP2_HUFF_ACCEPTED, NGHTTP2_HUFF_SYM)) 457 458 print('''\ 459typedef struct { 460 uint16_t fstate; 461 uint8_t sym; 462} nghttp2_huff_decode; 463''') 464 465 print('''\ 466const nghttp2_huff_decode huff_decode_table[][16] = {''') 467 huffman_tree_print_transition_table(ctx) 468 print('};') 469