1 /* Finding the optimal width of unicode characters in a buffer */ 2 3 #if !STRINGLIB_IS_UNICODE 4 # error "find_max_char.h is specific to Unicode" 5 #endif 6 7 /* Mask to quickly check whether a C 'long' contains a 8 non-ASCII, UTF8-encoded char. */ 9 #if (SIZEOF_LONG == 8) 10 # define UCS1_ASCII_CHAR_MASK 0x8080808080808080UL 11 #elif (SIZEOF_LONG == 4) 12 # define UCS1_ASCII_CHAR_MASK 0x80808080UL 13 #else 14 # error C 'long' size should be either 4 or 8! 15 #endif 16 17 #if STRINGLIB_SIZEOF_CHAR == 1 18 19 Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(find_max_char)20STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) 21 { 22 const unsigned char *p = (const unsigned char *) begin; 23 const unsigned char *aligned_end = 24 (const unsigned char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG); 25 26 while (p < end) { 27 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) { 28 /* Help register allocation */ 29 const unsigned char *_p = p; 30 while (_p < aligned_end) { 31 unsigned long value = *(unsigned long *) _p; 32 if (value & UCS1_ASCII_CHAR_MASK) 33 return 255; 34 _p += SIZEOF_LONG; 35 } 36 p = _p; 37 if (p == end) 38 break; 39 } 40 if (*p++ & 0x80) 41 return 255; 42 } 43 return 127; 44 } 45 46 #undef ASCII_CHAR_MASK 47 48 #else /* STRINGLIB_SIZEOF_CHAR == 1 */ 49 50 #define MASK_ASCII 0xFFFFFF80 51 #define MASK_UCS1 0xFFFFFF00 52 #define MASK_UCS2 0xFFFF0000 53 54 #define MAX_CHAR_ASCII 0x7f 55 #define MAX_CHAR_UCS1 0xff 56 #define MAX_CHAR_UCS2 0xffff 57 #define MAX_CHAR_UCS4 0x10ffff 58 59 Py_LOCAL_INLINE(Py_UCS4) STRINGLIB(find_max_char)60STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end) 61 { 62 #if STRINGLIB_SIZEOF_CHAR == 2 63 const Py_UCS4 mask_limit = MASK_UCS1; 64 const Py_UCS4 max_char_limit = MAX_CHAR_UCS2; 65 #elif STRINGLIB_SIZEOF_CHAR == 4 66 const Py_UCS4 mask_limit = MASK_UCS2; 67 const Py_UCS4 max_char_limit = MAX_CHAR_UCS4; 68 #else 69 #error Invalid STRINGLIB_SIZEOF_CHAR (must be 1, 2 or 4) 70 #endif 71 Py_UCS4 mask; 72 Py_ssize_t n = end - begin; 73 const STRINGLIB_CHAR *p = begin; 74 const STRINGLIB_CHAR *unrolled_end = begin + _Py_SIZE_ROUND_DOWN(n, 4); 75 Py_UCS4 max_char; 76 77 max_char = MAX_CHAR_ASCII; 78 mask = MASK_ASCII; 79 while (p < unrolled_end) { 80 STRINGLIB_CHAR bits = p[0] | p[1] | p[2] | p[3]; 81 if (bits & mask) { 82 if (mask == mask_limit) { 83 /* Limit reached */ 84 return max_char_limit; 85 } 86 if (mask == MASK_ASCII) { 87 max_char = MAX_CHAR_UCS1; 88 mask = MASK_UCS1; 89 } 90 else { 91 /* mask can't be MASK_UCS2 because of mask_limit above */ 92 assert(mask == MASK_UCS1); 93 max_char = MAX_CHAR_UCS2; 94 mask = MASK_UCS2; 95 } 96 /* We check the new mask on the same chars in the next iteration */ 97 continue; 98 } 99 p += 4; 100 } 101 while (p < end) { 102 if (p[0] & mask) { 103 if (mask == mask_limit) { 104 /* Limit reached */ 105 return max_char_limit; 106 } 107 if (mask == MASK_ASCII) { 108 max_char = MAX_CHAR_UCS1; 109 mask = MASK_UCS1; 110 } 111 else { 112 /* mask can't be MASK_UCS2 because of mask_limit above */ 113 assert(mask == MASK_UCS1); 114 max_char = MAX_CHAR_UCS2; 115 mask = MASK_UCS2; 116 } 117 /* We check the new mask on the same chars in the next iteration */ 118 continue; 119 } 120 p++; 121 } 122 return max_char; 123 } 124 125 #undef MASK_ASCII 126 #undef MASK_UCS1 127 #undef MASK_UCS2 128 #undef MAX_CHAR_ASCII 129 #undef MAX_CHAR_UCS1 130 #undef MAX_CHAR_UCS2 131 #undef MAX_CHAR_UCS4 132 133 #endif /* STRINGLIB_SIZEOF_CHAR == 1 */ 134 135