1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/ascii.h"
16
17 #include <climits>
18 #include <cstddef>
19 #include <cstring>
20 #include <string>
21
22 #include "absl/base/config.h"
23 #include "absl/base/nullability.h"
24
25 namespace absl {
26 ABSL_NAMESPACE_BEGIN
27 namespace ascii_internal {
28
29 // # Table generated by this Python code (bit 0x02 is currently unused):
30 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
31
32 // NOTE: The kAsciiPropertyBits table used within this code was generated by
33 // Python code of the following form. (Bit 0x02 is currently unused and
34 // available.)
35 //
36 // def Hex2(n):
37 // return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
38 // def IsPunct(ch):
39 // return (ord(ch) >= 32 and ord(ch) < 127 and
40 // not ch.isspace() and not ch.isalnum())
41 // def IsBlank(ch):
42 // return ch in ' \t'
43 // def IsCntrl(ch):
44 // return ord(ch) < 32 or ord(ch) == 127
45 // def IsXDigit(ch):
46 // return ch.isdigit() or ch.lower() in 'abcdef'
47 // for i in range(128):
48 // ch = chr(i)
49 // mask = ((ch.isalpha() and 0x01 or 0) |
50 // (ch.isalnum() and 0x04 or 0) |
51 // (ch.isspace() and 0x08 or 0) |
52 // (IsPunct(ch) and 0x10 or 0) |
53 // (IsBlank(ch) and 0x20 or 0) |
54 // (IsCntrl(ch) and 0x40 or 0) |
55 // (IsXDigit(ch) and 0x80 or 0))
56 // print Hex2(mask) + ',',
57 // if i % 16 == 7:
58 // print ' //', Hex2(i & 0x78)
59 // elif i % 16 == 15:
60 // print
61
62 // clang-format off
63 // Array of bitfields holding character information. Each bit value corresponds
64 // to a particular character feature. For readability, and because the value
65 // of these bits is tightly coupled to this implementation, the individual bits
66 // are not named. Note that bitfields for all characters above ASCII 127 are
67 // zero-initialized.
68 ABSL_DLL const unsigned char kPropertyBits[256] = {
69 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x00
70 0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
71 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // 0x10
72 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
73 0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // 0x20
74 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
75 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, // 0x30
76 0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
77 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x40
78 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
79 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x50
80 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
81 0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05, // 0x60
82 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
83 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, // 0x70
84 0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
85 };
86
87 // Array of characters for the ascii_tolower() function. For values 'A'
88 // through 'Z', return the lower-case character; otherwise, return the
89 // identity of the passed character.
90 ABSL_DLL const char kToLower[256] = {
91 '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
92 '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
93 '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
94 '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
95 '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
96 '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
97 '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
98 '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
99 '\x40', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
100 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
101 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
102 'x', 'y', 'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
103 '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
104 '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
105 '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
106 '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
107 '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
108 '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
109 '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
110 '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
111 '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
112 '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
113 '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
114 '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
115 '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
116 '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
117 '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
118 '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
119 '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
120 '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
121 '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
122 '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
123 };
124
125 // Array of characters for the ascii_toupper() function. For values 'a'
126 // through 'z', return the upper-case character; otherwise, return the
127 // identity of the passed character.
128 ABSL_DLL const char kToUpper[256] = {
129 '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
130 '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
131 '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
132 '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
133 '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
134 '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
135 '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
136 '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
137 '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
138 '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
139 '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
140 '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
141 '\x60', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
142 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
143 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
144 'X', 'Y', 'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
145 '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
146 '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
147 '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
148 '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
149 '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
150 '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
151 '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
152 '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
153 '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
154 '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
155 '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
156 '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
157 '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
158 '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
159 '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
160 '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
161 };
162 // clang-format on
163
164 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
165 // Implemented by:
166 // 1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
167 // 2. Comparing to SCHAR_MIN + 26.
168 template <bool ToUpper>
AsciiInAZRange(unsigned char c)169 constexpr bool AsciiInAZRange(unsigned char c) {
170 constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
171 constexpr signed char threshold = SCHAR_MIN + 26; // 26 = alphabet size.
172 // Using unsigned arithmetic as overflows/underflows are well defined.
173 unsigned char u = c - sub;
174 // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
175 return static_cast<signed char>(u) < threshold;
176 }
177
178 template <bool ToUpper>
AsciiInAZRangeNaive(unsigned char c)179 constexpr bool AsciiInAZRangeNaive(unsigned char c) {
180 constexpr unsigned char a = (ToUpper ? 'a' : 'A');
181 constexpr unsigned char z = (ToUpper ? 'z' : 'Z');
182 return a <= c && c <= z;
183 }
184
185 template <bool ToUpper, bool Naive>
AsciiStrCaseFoldImpl(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)186 constexpr void AsciiStrCaseFoldImpl(absl::Nonnull<char*> dst,
187 absl::Nullable<const char*> src,
188 size_t size) {
189 // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
190 // When we need to flip the case, we can xor with this bit to achieve the
191 // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
192 // could have chosen 'z' and 'Z', or any other pair of characters as they all
193 // have the same single bit difference.
194 constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
195
196 for (size_t i = 0; i < size; ++i) {
197 unsigned char v = static_cast<unsigned char>(src[i]);
198 if ABSL_INTERNAL_CONSTEXPR_SINCE_CXX17 (Naive) {
199 v ^= AsciiInAZRangeNaive<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
200 } else {
201 v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
202 }
203 dst[i] = static_cast<char>(v);
204 }
205 }
206
207 // Splitting to short and long strings to allow vectorization decisions
208 // to be made separately in the long and short cases.
209 // Using slightly different implementations so the compiler won't optimize them
210 // into the same code (the non-naive version is needed for SIMD, so for short
211 // strings it's not important).
212 // `src` may be null iff `size` is zero.
213 template <bool ToUpper>
AsciiStrCaseFold(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t size)214 constexpr void AsciiStrCaseFold(absl::Nonnull<char*> dst,
215 absl::Nullable<const char*> src, size_t size) {
216 size < 16 ? AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/true>(dst, src, size)
217 : AsciiStrCaseFoldImpl<ToUpper, /*Naive=*/false>(dst, src, size);
218 }
219
AsciiStrToLower(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t n)220 void AsciiStrToLower(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
221 size_t n) {
222 return AsciiStrCaseFold<false>(dst, src, n);
223 }
224
AsciiStrToUpper(absl::Nonnull<char * > dst,absl::Nullable<const char * > src,size_t n)225 void AsciiStrToUpper(absl::Nonnull<char*> dst, absl::Nullable<const char*> src,
226 size_t n) {
227 return AsciiStrCaseFold<true>(dst, src, n);
228 }
229
ValidateAsciiCasefold()230 static constexpr size_t ValidateAsciiCasefold() {
231 constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
232 size_t incorrect_index = 0;
233 char lowered[num_chars] = {};
234 char uppered[num_chars] = {};
235 for (unsigned int i = 0; i < num_chars; ++i) {
236 uppered[i] = lowered[i] = static_cast<char>(i);
237 }
238 AsciiStrCaseFold<false>(&lowered[0], &lowered[0], num_chars);
239 AsciiStrCaseFold<true>(&uppered[0], &uppered[0], num_chars);
240 for (size_t i = 0; i < num_chars; ++i) {
241 const char ch = static_cast<char>(i),
242 ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
243 ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
244 if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
245 incorrect_index = i > 0 ? i : num_chars;
246 break;
247 }
248 }
249 return incorrect_index;
250 }
251
252 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
253
254 } // namespace ascii_internal
255
AsciiStrToLower(absl::Nonnull<std::string * > s)256 void AsciiStrToLower(absl::Nonnull<std::string*> s) {
257 char* p = &(*s)[0];
258 return ascii_internal::AsciiStrCaseFold<false>(p, p, s->size());
259 }
260
AsciiStrToUpper(absl::Nonnull<std::string * > s)261 void AsciiStrToUpper(absl::Nonnull<std::string*> s) {
262 char* p = &(*s)[0];
263 return ascii_internal::AsciiStrCaseFold<true>(p, p, s->size());
264 }
265
RemoveExtraAsciiWhitespace(absl::Nonnull<std::string * > str)266 void RemoveExtraAsciiWhitespace(absl::Nonnull<std::string*> str) {
267 auto stripped = StripAsciiWhitespace(*str);
268
269 if (stripped.empty()) {
270 str->clear();
271 return;
272 }
273
274 auto input_it = stripped.begin();
275 auto input_end = stripped.end();
276 auto output_it = &(*str)[0];
277 bool is_ws = false;
278
279 for (; input_it < input_end; ++input_it) {
280 if (is_ws) {
281 // Consecutive whitespace? Keep only the last.
282 is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
283 if (is_ws) --output_it;
284 } else {
285 is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
286 }
287
288 *output_it = *input_it;
289 ++output_it;
290 }
291
292 str->erase(static_cast<size_t>(output_it - &(*str)[0]));
293 }
294
295 ABSL_NAMESPACE_END
296 } // namespace absl
297