1 /* Finding the optimal width of unicode characters in a buffer */ 2 3 #if !STRINGLIB_IS_UNICODE 4 # error "find_max_char.h is specific to Unicode" 5 #endif 6 7 /* Mask to quickly check whether a C 'size_t' contains a 8 non-ASCII, UTF8-encoded char. */ 9 #if (SIZEOF_SIZE_T == 8) 10 # define UCS1_ASCII_CHAR_MASK 0x8080808080808080ULL 11 #elif (SIZEOF_SIZE_T == 4) 12 # define UCS1_ASCII_CHAR_MASK 0x80808080U 13 #else 14 # error C 'size_t' size should be either 4 or 8! 15 #endif 16 17 #if STRINGLIB_SIZEOF_CHAR == 1 18 19 Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(find_max_char)20STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) 21 { 22 const unsigned char *p = (const unsigned char *) begin; 23 24 while (p < end) { 25 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) { 26 /* Help register allocation */ 27 const unsigned char *_p = p; 28 while (_p + SIZEOF_SIZE_T <= end) { 29 size_t value = *(const size_t *) _p; 30 if (value & UCS1_ASCII_CHAR_MASK) 31 return 255; 32 _p += SIZEOF_SIZE_T; 33 } 34 p = _p; 35 if (p == end) 36 break; 37 } 38 if (*p++ & 0x80) 39 return 255; 40 } 41 return 127; 42 } 43 44 #undef ASCII_CHAR_MASK 45 46 #else /* STRINGLIB_SIZEOF_CHAR == 1 */ 47 48 #define MASK_ASCII 0xFFFFFF80 49 #define MASK_UCS1 0xFFFFFF00 50 #define MASK_UCS2 0xFFFF0000 51 52 #define MAX_CHAR_ASCII 0x7f 53 #define MAX_CHAR_UCS1 0xff 54 #define MAX_CHAR_UCS2 0xffff 55 #define MAX_CHAR_UCS4 0x10ffff 56 57 Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(find_max_char)58STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) 59 { 60 #if STRINGLIB_SIZEOF_CHAR == 2 61 const Py_UCS4 mask_limit = MASK_UCS1; 62 const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; 63 #elif STRINGLIB_SIZEOF_CHAR == 4 64 const Py_UCS4 mask_limit = MASK_UCS2; 65 const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; 66 #else 67 #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) 68 #endif 69 Py_UCS4 mask; 70 Py_ssize_t n = end - begin; 71 const STRINGLIB_CHAR *p = begin; 72 const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); 73 Py_UCS4 max_char; 74 75 max_char = MAX_CHAR_ASCII; 76 mask = MASK_ASCII; 77 while (p < unrolled_end) { 78 STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; 79 if (bits & mask) { 80 if (mask == mask_limit) { 81 /* Limit reached */ 82 return max_char_limit; 83 } 84 if (mask == MASK_ASCII) { 85 max_char = MAX_CHAR_UCS1; 86 mask = MASK_UCS1; 87 } 88 else { 89 /* mask can't be MASK_UCS2 because of mask_limit above */ 90 assert(mask == MASK_UCS1); 91 max_char = MAX_CHAR_UCS2; 92 mask = MASK_UCS2; 93 } 94 /* We check the new mask on the same chars in the next iteration */ 95 continue; 96 } 97 p += 4; 98 } 99 while (p < end) { 100 if (p[0] & mask) { 101 if (mask == mask_limit) { 102 /* Limit reached */ 103 return max_char_limit; 104 } 105 if (mask == MASK_ASCII) { 106 max_char = MAX_CHAR_UCS1; 107 mask = MASK_UCS1; 108 } 109 else { 110 /* mask can't be MASK_UCS2 because of mask_limit above */ 111 assert(mask == MASK_UCS1); 112 max_char = MAX_CHAR_UCS2; 113 mask = MASK_UCS2; 114 } 115 /* We check the new mask on the same chars in the next iteration */ 116 continue; 117 } 118 p++; 119 } 120 return max_char; 121 } 122 123 #undef MASK_ASCII 124 #undef MASK_UCS1 125 #undef MASK_UCS2 126 #undef MAX_CHAR_ASCII 127 #undef MAX_CHAR_UCS1 128 #undef MAX_CHAR_UCS2 129 #undef MAX_CHAR_UCS4 130 131 #endif /* STRINGLIB_SIZEOF_CHAR == 1 */ 132 133