1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/ascii.h"
16
17 #include <climits>
18 #include <cstring>
19 #include <string>
20
21 namespace absl {
22 ABSL_NAMESPACE_BEGIN
23 namespace ascii_internal {
24
25 // # Table generated by this Python code (bit 0x02 is currently unused):
26 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
27
// NOTE: The kPropertyBits table used within this code was generated by
// Python code of the following form. (Bit 0x02 is currently unused and
// available.)
31 //
//   def Hex2(n):
//     return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
//   def IsPunct(ch):
//     return (ord(ch) >= 32 and ord(ch) < 127 and
//             not ch.isspace() and not ch.isalnum())
//   def IsBlank(ch):
//     return ch in ' \t'
//   def IsCntrl(ch):
//     return ord(ch) < 32 or ord(ch) == 127
//   def IsXDigit(ch):
//     return ch.isdigit() or ch.lower() in 'abcdef'
//   for i in range(128):
//     ch = chr(i)
//     mask = ((ch.isalpha() and 0x01 or 0) |
//             (ch.isalnum() and 0x04 or 0) |
//             (ch.isspace() and 0x08 or 0) |
//             (IsPunct(ch) and 0x10 or 0) |
//             (IsBlank(ch) and 0x20 or 0) |
//             (IsCntrl(ch) and 0x40 or 0) |
//             (IsXDigit(ch) and 0x80 or 0))
//     print Hex2(mask) + ',',
//     if i % 16 == 7:
//       print ' //', Hex2(i & 0x78)
//     elif i % 16 == 15:
//       print
57
58 // clang-format off
// Array of bitfields holding character information. Each bit value corresponds
// to a particular character feature. For readability, and because the value
// of these bits is tightly coupled to this implementation, the individual bits
// are not named. Note that bitfields for all characters above ASCII 127 are
// zero-initialized.
//
// Bit assignments, as produced by the Python generator sketch earlier in this
// file:
//   0x01  alphabetic              0x10  punctuation
//   0x04  alphanumeric            0x20  blank (space or tab)
//   0x08  whitespace              0x40  control character
//   0x80  hexadecimal digit       (0x02 is unused)
//
// Only the first 128 entries are spelled out below; the array is declared with
// 256 elements, so the remaining ones are implicitly zero.
ABSL_DLL const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,  // 0x08 (tab, \n, \v, \f, \r)
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x18
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20 (space first)
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x28
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30 ('0'..'7')
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x38 ('8', '9', ...)
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40 ('@', 'A'..'G')
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x48
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x58 ('X'..'Z', ...)
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60 ('`', 'a'..'g')
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x68
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,  // 0x78 ('x'..'z', ..., DEL)
};
82
// Array of characters for the ascii_tolower() function. For values 'A'
// through 'Z', return the lower-case character; otherwise, return the
// identity of the passed character.
//
// Each row below holds eight consecutive entries; the trailing comment gives
// the index of the row's first entry. Only the rows at 0x40..0x58 differ from
// the identity mapping.
// NOTE(review): presumably indexed by the input character converted to
// unsigned char; the lookup itself is not in this file -- confirm in ascii.h.
ABSL_DLL const char kToLower[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',  // 0x00
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',  // 0x08
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',  // 0x10
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',  // 0x18
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',  // 0x20
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',  // 0x28
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',  // 0x30
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',  // 0x38
    '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',  // 0x40
       'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',  // 0x48
       'p',    'q',    'r',    's',    't',    'u',    'v',    'w',  // 0x50
       'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',  // 0x58
    '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',  // 0x60
    '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',  // 0x68
    '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',  // 0x70
    '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',  // 0x78
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',  // 0x80
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',  // 0x88
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',  // 0x90
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',  // 0x98
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',  // 0xa0
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',  // 0xa8
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',  // 0xb0
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',  // 0xb8
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',  // 0xc0
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',  // 0xc8
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',  // 0xd0
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',  // 0xd8
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',  // 0xe0
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',  // 0xe8
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',  // 0xf0
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',  // 0xf8
};
120
// Array of characters for the ascii_toupper() function. For values 'a'
// through 'z', return the upper-case character; otherwise, return the
// identity of the passed character.
//
// Each row below holds eight consecutive entries; the trailing comment gives
// the index of the row's first entry. Only the rows at 0x60..0x78 differ from
// the identity mapping.
// NOTE(review): presumably indexed by the input character converted to
// unsigned char; the lookup itself is not in this file -- confirm in ascii.h.
ABSL_DLL const char kToUpper[256] = {
    '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',  // 0x00
    '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',  // 0x08
    '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',  // 0x10
    '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',  // 0x18
    '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',  // 0x20
    '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',  // 0x28
    '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',  // 0x30
    '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',  // 0x38
    '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',  // 0x40
    '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',  // 0x48
    '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',  // 0x50
    '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',  // 0x58
    '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',  // 0x60
       'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',  // 0x68
       'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',  // 0x70
       'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',  // 0x78
    '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',  // 0x80
    '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',  // 0x88
    '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',  // 0x90
    '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',  // 0x98
    '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',  // 0xa0
    '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',  // 0xa8
    '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',  // 0xb0
    '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',  // 0xb8
    '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',  // 0xc0
    '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',  // 0xc8
    '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',  // 0xd0
    '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',  // 0xd8
    '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',  // 0xe0
    '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',  // 0xe8
    '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',  // 0xf0
    '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',  // 0xf8
};
158 // clang-format on
159
// Rewrites the characters in [p, end) in place, converting ASCII letters to
// upper case when `ToUpper` is true and to lower case otherwise. Every other
// byte value (digits, punctuation, bytes >= 0x80, ...) is left untouched.
// Does nothing when `p >= end`.
template <bool ToUpper>
constexpr void AsciiStrCaseFold(char* p, char* end) {
  // An ASCII letter and its opposite-case twin differ in exactly one bit, so
  // toggling that bit switches case. 'a'/'A' is an arbitrary representative;
  // any letter pair yields the same mask.
  constexpr unsigned char kCaseToggleMask = 'a' ^ 'A';

  // Bounds of the letter range that has to be converted: lower-case letters
  // when folding to upper case, upper-case letters when folding to lower.
  constexpr char kFirstLetter = ToUpper ? 'a' : 'A';
  constexpr char kLastLetter = ToUpper ? 'z' : 'Z';

  while (p < end) {
    const unsigned char current = static_cast<unsigned char>(*p);
    // Bitwise & (not &&) keeps the range test branchless. The operands are
    // cast to int to suppress -Wbitwise-instead-of-logical.
    const int needs_flip = static_cast<int>(kFirstLetter <= current) &
                           static_cast<int>(current <= kLastLetter);
    *p = static_cast<char>(current ^ (needs_flip != 0 ? kCaseToggleMask : 0));
    ++p;
  }
}
181
ValidateAsciiCasefold()182 static constexpr size_t ValidateAsciiCasefold() {
183 constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
184 size_t incorrect_index = 0;
185 char lowered[num_chars] = {};
186 char uppered[num_chars] = {};
187 for (unsigned int i = 0; i < num_chars; ++i) {
188 uppered[i] = lowered[i] = static_cast<char>(i);
189 }
190 AsciiStrCaseFold<false>(&lowered[0], &lowered[num_chars]);
191 AsciiStrCaseFold<true>(&uppered[0], &uppered[num_chars]);
192 for (size_t i = 0; i < num_chars; ++i) {
193 const char ch = static_cast<char>(i),
194 ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
195 ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
196 if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
197 incorrect_index = i > 0 ? i : num_chars;
198 break;
199 }
200 }
201 return incorrect_index;
202 }
203
204 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
205
206 } // namespace ascii_internal
207
AsciiStrToLower(std::string * s)208 void AsciiStrToLower(std::string* s) {
209 char* p = &(*s)[0]; // Guaranteed to be valid for empty strings
210 return ascii_internal::AsciiStrCaseFold<false>(p, p + s->size());
211 }
212
AsciiStrToUpper(std::string * s)213 void AsciiStrToUpper(std::string* s) {
214 char* p = &(*s)[0]; // Guaranteed to be valid for empty strings
215 return ascii_internal::AsciiStrCaseFold<true>(p, p + s->size());
216 }
217
RemoveExtraAsciiWhitespace(std::string * str)218 void RemoveExtraAsciiWhitespace(std::string* str) {
219 auto stripped = StripAsciiWhitespace(*str);
220
221 if (stripped.empty()) {
222 str->clear();
223 return;
224 }
225
226 auto input_it = stripped.begin();
227 auto input_end = stripped.end();
228 auto output_it = &(*str)[0];
229 bool is_ws = false;
230
231 for (; input_it < input_end; ++input_it) {
232 if (is_ws) {
233 // Consecutive whitespace? Keep only the last.
234 is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
235 if (is_ws) --output_it;
236 } else {
237 is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
238 }
239
240 *output_it = *input_it;
241 ++output_it;
242 }
243
244 str->erase(static_cast<size_t>(output_it - &(*str)[0]));
245 }
246
247 ABSL_NAMESPACE_END
248 } // namespace absl
249