• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 #include <climits>
18 #include <cstddef>
19 #include <cstring>
20 #include <string>
21 
22 #include "absl/base/config.h"
23 #include "absl/base/nullability.h"
24 
25 namespace absl {
26 ABSL_NAMESPACE_BEGIN
27 namespace ascii_internal {
28 
29 // # Table generated by this Python code (bit 0x02 is currently unused):
30 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
31 
32 // NOTE: The kAsciiPropertyBits table used within this code was generated by
33 // Python code of the following form. (Bit 0x02 is currently unused and
34 // available.)
35 //
36 // def Hex2(n):
37 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
38 // def IsPunct(ch):
39 //   return (ord(ch) >= 32 and ord(ch) < 127 and
40 //           not ch.isspace() and not ch.isalnum())
41 // def IsBlank(ch):
42 //   return ch in ' \t'
43 // def IsCntrl(ch):
44 //   return ord(ch) < 32 or ord(ch) == 127
45 // def IsXDigit(ch):
46 //   return ch.isdigit() or ch.lower() in 'abcdef'
47 // for i in range(128):
48 //   ch = chr(i)
49 //   mask = ((ch.isalpha() and 0x01 or 0) |
50 //           (ch.isalnum() and 0x04 or 0) |
51 //           (ch.isspace() and 0x08 or 0) |
52 //           (IsPunct(ch) and 0x10 or 0) |
53 //           (IsBlank(ch) and 0x20 or 0) |
54 //           (IsCntrl(ch) and 0x40 or 0) |
55 //           (IsXDigit(ch) and 0x80 or 0))
56 //   print Hex2(mask) + ',',
57 //   if i % 16 == 7:
58 //     print ' //', Hex2(i & 0x78)
59 //   elif i % 16 == 15:
60 //     print
61 
62 // clang-format off
63 // Array of bitfields holding character information. Each bit value corresponds
64 // to a particular character feature. For readability, and because the value
65 // of these bits is tightly coupled to this implementation, the individual bits
66 // are not named. Note that bitfields for all characters above ASCII 127 are
67 // zero-initialized.
68 ABSL_DLL const unsigned char kPropertyBits[256] = {
69     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
70     0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
71     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
72     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
73     0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
74     0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
75     0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
76     0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
77     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
78     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
79     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
80     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
81     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
82     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
83     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
84     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
85 };
86 
87 // Array of characters for the ascii_tolower() function. For values 'A'
88 // through 'Z', return the lower-case character; otherwise, return the
89 // identity of the passed character.
90 ABSL_DLL const char kToLower[256] = {
91   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
92   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
93   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
94   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
95   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
96   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
97   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
98   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
99   '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
100      'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
101      'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
102      'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
103   '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
104   '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
105   '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
106   '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
107   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
108   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
109   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
110   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
111   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
112   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
113   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
114   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
115   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
116   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
117   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
118   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
119   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
120   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
121   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
122   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
123 };
124 
125 // Array of characters for the ascii_toupper() function. For values 'a'
126 // through 'z', return the upper-case character; otherwise, return the
127 // identity of the passed character.
128 ABSL_DLL const char kToUpper[256] = {
129   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
130   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
131   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
132   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
133   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
134   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
135   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
136   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
137   '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
138   '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
139   '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
140   '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
141   '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
142      'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
143      'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
144      'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
145   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
146   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
147   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
148   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
149   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
150   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
151   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
152   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
153   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
154   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
155   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
156   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
157   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
158   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
159   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
160   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
161 };
162 // clang-format on
163 
164 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
165 // Implemented by:
166 //  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
167 //  2. Comparing to SCHAR_MIN + 26.
168 template <bool ToUpper>
AsciiInAZRange(unsigned char c)169 constexpr bool AsciiInAZRange(unsigned char c) {
170   constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
171   constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
172   // Using unsigned arithmetic as overflows/underflows are well defined.
173   unsigned char u = c - sub;
174   // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
175   return static_cast<signed char>(u) < threshold;
176 }
177 
178 template <bool ToUpper>
AsciiInAZRangeNaive(unsigned char c)179 constexpr bool AsciiInAZRangeNaive(unsigned char c) {
180   constexpr unsigned char a = (ToUpper ? 'a' : 'A');
181   constexpr unsigned char z = (ToUpper ? 'z' : 'Z');
182   return a <= c && c <= z;
183 }
184 
185 template <bool ToUpper, bool Naive>
AsciiStrCaseFoldImpl(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)186 constexpr void AsciiStrCaseFoldImpl(absl::Nonnull<char*> dst,
187                                     absl::Nullable<const char*> src,
188                                     size_t size) {
189   // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
190   // When we need to flip the case, we can xor with this bit to achieve the
191   // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
192   // could have chosen 'z' and 'Z', or any other pair of characters as they all
193   // have the same single bit difference.
194   constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
195 
196   for (size_t i = 0; i < size; ++i) {
197     unsigned char v = static_cast<unsigned char>(src[i]);
198     if ABSL_INTERNAL_CONSTEXPR_SINCE_CXX17 (Naive) {
199       v ^= AsciiInAZRangeNaive<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
200     } else {
201       v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
202     }
203     dst[i] = static_cast<char>(v);
204   }
205 }
206 
207 // Splitting to short and long strings to allow vectorization decisions
208 // to be made separately in the long and short cases.
209 // Using slightly different implementations so the compiler won't optimize them
210 // into the same code (the non-naive version is needed for SIMD, so for short
211 // strings it's not important).
212 // `src` may be null iff `size` is zero.
213 template <bool ToUpper>
AsciiStrCaseFold(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)214 constexpr void AsciiStrCaseFold(absl::Nonnull<char*> dst,
215                                 absl::Nullable<const char*> src, size_t size) {
216   size < 16 ? AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/true>(dst, src, size)
217             : AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/false>(dst, src, size);
218 }
219 
AsciiStrToLower(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t n)220 void AsciiStrToLower(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
221                      size_t n) {
222   return AsciiStrCaseFold<false>(dst, src, n);
223 }
224 
AsciiStrToUpper(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t n)225 void AsciiStrToUpper(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
226                      size_t n) {
227   return AsciiStrCaseFold<true>(dst, src, n);
228 }
229 
ValidateAsciiCasefold()230 static constexpr size_t ValidateAsciiCasefold() {
231   constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
232   size_t incorrect_index = 0;
233   char lowered[num_chars] = {};
234   char uppered[num_chars] = {};
235   for (unsigned int i = 0; i < num_chars; ++i) {
236     uppered[i] = lowered[i] = static_cast<char>(i);
237   }
238   AsciiStrCaseFold<false>(&lowered[0], &lowered[0], num_chars);
239   AsciiStrCaseFold<true>(&uppered[0], &uppered[0], num_chars);
240   for (size_t i = 0; i < num_chars; ++i) {
241     const char ch = static_cast<char>(i),
242                ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
243                ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
244     if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
245       incorrect_index = i > 0 ? i : num_chars;
246       break;
247     }
248   }
249   return incorrect_index;
250 }
251 
252 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
253 
254 }  // namespace ascii_internal
255 
AsciiStrToLower(absl::Nonnull<std::string * > s)256 void AsciiStrToLower(absl::Nonnull<std::string*> s) {
257   char* p = &(*s)[0];
258   return ascii_internal::AsciiStrCaseFold<false>(p, p, s->size());
259 }
260 
AsciiStrToUpper(absl::Nonnull<std::string * > s)261 void AsciiStrToUpper(absl::Nonnull<std::string*> s) {
262   char* p = &(*s)[0];
263   return ascii_internal::AsciiStrCaseFold<true>(p, p, s->size());
264 }
265 
RemoveExtraAsciiWhitespace(absl::Nonnull<std::string * > str)266 void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) {
267   auto stripped = StripAsciiWhitespace(*str);
268 
269   if (stripped.empty()) {
270     str->clear();
271     return;
272   }
273 
274   auto input_it = stripped.begin();
275   auto input_end = stripped.end();
276   auto output_it = &(*str)[0];
277   bool is_ws = false;
278 
279   for (; input_it < input_end; ++input_it) {
280     if (is_ws) {
281       // Consecutive whitespace?  Keep only the last.
282       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
283       if (is_ws) --output_it;
284     } else {
285       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
286     }
287 
288     *output_it = *input_it;
289     ++output_it;
290   }
291 
292   str->erase(static_cast<size_t>(output_it - &(*str)[0]));
293 }
294 
295 ABSL_NAMESPACE_END
296 }  // namespace absl
297