• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 #include <climits>
18 #include <cstdint>
19 #include <cstring>
20 #include <string>
21 #include <type_traits>
22 
23 #include "absl/base/config.h"
24 #include "absl/base/nullability.h"
25 
26 namespace absl {
27 ABSL_NAMESPACE_BEGIN
28 namespace ascii_internal {
29 
30 // # Table generated by this Python code (bit 0x02 is currently unused):
31 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
32 
33 // NOTE: The kAsciiPropertyBits table used within this code was generated by
34 // Python code of the following form. (Bit 0x02 is currently unused and
35 // available.)
36 //
37 // def Hex2(n):
38 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
39 // def IsPunct(ch):
40 //   return (ord(ch) >= 32 and ord(ch) < 127 and
41 //           not ch.isspace() and not ch.isalnum())
42 // def IsBlank(ch):
43 //   return ch in ' \t'
44 // def IsCntrl(ch):
45 //   return ord(ch) < 32 or ord(ch) == 127
46 // def IsXDigit(ch):
47 //   return ch.isdigit() or ch.lower() in 'abcdef'
48 // for i in range(128):
49 //   ch = chr(i)
50 //   mask = ((ch.isalpha() and 0x01 or 0) |
51 //           (ch.isalnum() and 0x04 or 0) |
52 //           (ch.isspace() and 0x08 or 0) |
53 //           (IsPunct(ch) and 0x10 or 0) |
54 //           (IsBlank(ch) and 0x20 or 0) |
55 //           (IsCntrl(ch) and 0x40 or 0) |
56 //           (IsXDigit(ch) and 0x80 or 0))
57 //   print Hex2(mask) + ',',
58 //   if i % 16 == 7:
59 //     print ' //', Hex2(i & 0x78)
60 //   elif i % 16 == 15:
61 //     print
62 
63 // clang-format off
64 // Array of bitfields holding character information. Each bit value corresponds
65 // to a particular character feature. For readability, and because the value
66 // of these bits is tightly coupled to this implementation, the individual bits
67 // are not named. Note that bitfields for all characters above ASCII 127 are
68 // zero-initialized.
69 ABSL_DLL const unsigned char kPropertyBits[256] = {
70     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
71     0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
72     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
73     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
74     0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
75     0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
76     0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
77     0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
78     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
79     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
80     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
81     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
82     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
83     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
84     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
85     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
86 };
87 
88 // Array of characters for the ascii_tolower() function. For values 'A'
89 // through 'Z', return the lower-case character; otherwise, return the
90 // identity of the passed character.
91 ABSL_DLL const char kToLower[256] = {
92   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
93   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
94   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
95   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
96   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
97   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
98   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
99   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
100   '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
101      'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
102      'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
103      'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
104   '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
105   '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
106   '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
107   '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
108   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
109   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
110   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
111   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
112   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
113   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
114   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
115   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
116   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
117   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
118   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
119   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
120   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
121   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
122   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
123   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
124 };
125 
126 // Array of characters for the ascii_toupper() function. For values 'a'
127 // through 'z', return the upper-case character; otherwise, return the
128 // identity of the passed character.
129 ABSL_DLL const char kToUpper[256] = {
130   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
131   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
132   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
133   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
134   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
135   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
136   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
137   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
138   '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
139   '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
140   '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
141   '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
142   '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
143      'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
144      'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
145      'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
146   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
147   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
148   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
149   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
150   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
151   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
152   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
153   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
154   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
155   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
156   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
157   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
158   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
159   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
160   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
161   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
162 };
163 // clang-format on
164 
165 template <class T>
BroadcastByte(unsigned char value)166 static constexpr T BroadcastByte(unsigned char value) {
167   static_assert(std::is_integral<T>::value && sizeof(T) <= sizeof(uint64_t) &&
168                     std::is_unsigned<T>::value,
169                 "only unsigned integers up to 64-bit allowed");
170   T result = value;
171   constexpr size_t result_bit_width = sizeof(result) * CHAR_BIT;
172   result |= result << ((CHAR_BIT << 0) & (result_bit_width - 1));
173   result |= result << ((CHAR_BIT << 1) & (result_bit_width - 1));
174   result |= result << ((CHAR_BIT << 2) & (result_bit_width - 1));
175   return result;
176 }
177 
178 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
179 // Implemented by:
180 //  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
181 //  2. Comparing to SCHAR_MIN + 26.
182 template <bool ToUpper>
AsciiInAZRange(unsigned char c)183 constexpr bool AsciiInAZRange(unsigned char c) {
184   constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
185   constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
186   // Using unsigned arithmetic as overflows/underflows are well defined.
187   unsigned char u = c - sub;
188   // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
189   return static_cast<signed char>(u) < threshold;
190 }
191 
192 template <bool ToUpper>
PartialAsciiStrCaseFold(absl::Nonnull<char * > p,absl::Nonnull<char * > end)193 static constexpr char* PartialAsciiStrCaseFold(absl::Nonnull<char*> p,
194                                                absl::Nonnull<char*> end) {
195   using vec_t = size_t;
196   const size_t n = static_cast<size_t>(end - p);
197 
198   // SWAR algorithm: http://0x80.pl/notesen/2016-01-06-swar-swap-case.html
199   constexpr char ch_a = ToUpper ? 'a' : 'A', ch_z = ToUpper ? 'z' : 'Z';
200   char* const swar_end = p + (n / sizeof(vec_t)) * sizeof(vec_t);
201   while (p < swar_end) {
202     vec_t v = vec_t();
203 
204     // memcpy the vector, but constexpr
205     for (size_t i = 0; i < sizeof(vec_t); ++i) {
206       v |= static_cast<vec_t>(static_cast<unsigned char>(p[i]))
207            << (i * CHAR_BIT);
208     }
209 
210     constexpr unsigned int msb = 1u << (CHAR_BIT - 1);
211     const vec_t v_msb = v & BroadcastByte<vec_t>(msb);
212     const vec_t v_nonascii_mask = (v_msb << 1) - (v_msb >> (CHAR_BIT - 1));
213     const vec_t v_nonascii = v & v_nonascii_mask;
214     const vec_t v_ascii = v & ~v_nonascii_mask;
215     const vec_t a = v_ascii + BroadcastByte<vec_t>(msb - ch_a - 0),
216                 z = v_ascii + BroadcastByte<vec_t>(msb - ch_z - 1);
217     v = v_nonascii | (v_ascii ^ ((a ^ z) & BroadcastByte<vec_t>(msb)) >> 2);
218 
219     // memcpy the vector, but constexpr
220     for (size_t i = 0; i < sizeof(vec_t); ++i) {
221       p[i] = static_cast<char>(v >> (i * CHAR_BIT));
222     }
223 
224     p += sizeof(v);
225   }
226 
227   return p;
228 }
229 
230 template <bool ToUpper>
AsciiStrCaseFold(absl::Nonnull<char * > p,absl::Nonnull<char * > end)231 static constexpr void AsciiStrCaseFold(absl::Nonnull<char*> p,
232                                        absl::Nonnull<char*> end) {
233   // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
234   // When we need to flip the case, we can xor with this bit to achieve the
235   // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
236   // could have chosen 'z' and 'Z', or any other pair of characters as they all
237   // have the same single bit difference.
238   constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
239 
240   using vec_t = size_t;
241   // TODO(b/316380338): When FDO becomes able to vectorize these,
242   // revert this manual optimization and just leave the naive loop.
243   if (static_cast<size_t>(end - p) >= sizeof(vec_t)) {
244     p = ascii_internal::PartialAsciiStrCaseFold<ToUpper>(p, end);
245   }
246   while (p < end) {
247     unsigned char v = static_cast<unsigned char>(*p);
248     v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
249     *p = static_cast<char>(v);
250     ++p;
251   }
252 }
253 
ValidateAsciiCasefold()254 static constexpr size_t ValidateAsciiCasefold() {
255   constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
256   size_t incorrect_index = 0;
257   char lowered[num_chars] = {};
258   char uppered[num_chars] = {};
259   for (unsigned int i = 0; i < num_chars; ++i) {
260     uppered[i] = lowered[i] = static_cast<char>(i);
261   }
262   AsciiStrCaseFold<false>(&lowered[0], &lowered[num_chars]);
263   AsciiStrCaseFold<true>(&uppered[0], &uppered[num_chars]);
264   for (size_t i = 0; i < num_chars; ++i) {
265     const char ch = static_cast<char>(i),
266                ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
267                ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
268     if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
269       incorrect_index = i > 0 ? i : num_chars;
270       break;
271     }
272   }
273   return incorrect_index;
274 }
275 
276 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
277 
278 }  // namespace ascii_internal
279 
AsciiStrToLower(absl::Nonnull<std::string * > s)280 void AsciiStrToLower(absl::Nonnull<std::string*> s) {
281   char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
282   return ascii_internal::AsciiStrCaseFold<false>(p, p + s->size());
283 }
284 
AsciiStrToUpper(absl::Nonnull<std::string * > s)285 void AsciiStrToUpper(absl::Nonnull<std::string*> s) {
286   char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
287   return ascii_internal::AsciiStrCaseFold<true>(p, p + s->size());
288 }
289 
RemoveExtraAsciiWhitespace(absl::Nonnull<std::string * > str)290 void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) {
291   auto stripped = StripAsciiWhitespace(*str);
292 
293   if (stripped.empty()) {
294     str->clear();
295     return;
296   }
297 
298   auto input_it = stripped.begin();
299   auto input_end = stripped.end();
300   auto output_it = &(*str)[0];
301   bool is_ws = false;
302 
303   for (; input_it < input_end; ++input_it) {
304     if (is_ws) {
305       // Consecutive whitespace?  Keep only the last.
306       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
307       if (is_ws) --output_it;
308     } else {
309       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
310     }
311 
312     *output_it = *input_it;
313     ++output_it;
314   }
315 
316   str->erase(static_cast<size_t>(output_it - &(*str)[0]));
317 }
318 
319 ABSL_NAMESPACE_END
320 }  // namespace absl
321