• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 #include <climits>
18 #include <cstring>
19 #include <string>
20 
21 namespace absl {
22 ABSL_NAMESPACE_BEGIN
23 namespace ascii_internal {
24 
25 // # Table generated by this Python code (bit 0x02 is currently unused):
26 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
27 
// NOTE: The kPropertyBits table used within this code was generated by
// Python code of the following form. (Bit 0x02 is currently unused and
// available.)
31 //
32 // def Hex2(n):
33 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
34 // def IsPunct(ch):
35 //   return (ord(ch) >= 32 and ord(ch) < 127 and
36 //           not ch.isspace() and not ch.isalnum())
37 // def IsBlank(ch):
38 //   return ch in ' \t'
39 // def IsCntrl(ch):
40 //   return ord(ch) < 32 or ord(ch) == 127
41 // def IsXDigit(ch):
42 //   return ch.isdigit() or ch.lower() in 'abcdef'
43 // for i in range(128):
44 //   ch = chr(i)
45 //   mask = ((ch.isalpha() and 0x01 or 0) |
46 //           (ch.isalnum() and 0x04 or 0) |
47 //           (ch.isspace() and 0x08 or 0) |
48 //           (IsPunct(ch) and 0x10 or 0) |
49 //           (IsBlank(ch) and 0x20 or 0) |
50 //           (IsCntrl(ch) and 0x40 or 0) |
51 //           (IsXDigit(ch) and 0x80 or 0))
52 //   print Hex2(mask) + ',',
53 //   if i % 16 == 7:
54 //     print ' //', Hex2(i & 0x78)
55 //   elif i % 16 == 15:
56 //     print
57 
58 // clang-format off
// Array of bitfields holding character information. Each bit value corresponds
// to a particular character feature. For readability, and because the value
// of these bits is tightly coupled to this implementation, the individual bits
// are not named. Note that bitfields for all characters above ASCII 127 are
// zero-initialized.
//
// Bit layout, as produced by the generator script documented earlier in this
// file:
//   0x01  alphabetic
//   0x02  (unused)
//   0x04  alphanumeric
//   0x08  whitespace
//   0x10  punctuation
//   0x20  blank (space or tab)
//   0x40  control
//   0x80  hexadecimal digit
//
// Only the first 128 entries are spelled out; the remaining 128 elements of
// the 256-entry array have no initializer and are therefore zero.
ABSL_DLL const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
};
82 
// Array of characters for the ascii_tolower() function. For values 'A'
// through 'Z', return the lower-case character; otherwise, return the
// identity of the passed character.
//
// The table is indexed by the character's value as an unsigned byte
// (0x00-0xff). Only the entries at 0x41-0x5a differ from their own index;
// every other entry, including all bytes at or above 0x80, maps to itself.
ABSL_DLL const char kToLower[256] = {
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  // Indices 0x41-0x5a ('A'-'Z') hold the corresponding lower-case letters.
  '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
     'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
     'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
     'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  // Bytes outside the 7-bit ASCII range are passed through unchanged.
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
120 
// Array of characters for the ascii_toupper() function. For values 'a'
// through 'z', return the upper-case character; otherwise, return the
// identity of the passed character.
//
// The table is indexed by the character's value as an unsigned byte
// (0x00-0xff). Only the entries at 0x61-0x7a differ from their own index;
// every other entry, including all bytes at or above 0x80, maps to itself.
ABSL_DLL const char kToUpper[256] = {
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  // Indices 0x61-0x7a ('a'-'z') hold the corresponding upper-case letters.
  '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
     'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
     'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
     'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  // Bytes outside the 7-bit ASCII range are passed through unchanged.
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
158 // clang-format on
159 
// Converts the ASCII letters in the half-open range [p, end) in place.
//
// With ToUpper == true, 'a'..'z' become 'A'..'Z'; with ToUpper == false,
// 'A'..'Z' become 'a'..'z'. Every other byte is left exactly as it was.
template <bool ToUpper>
constexpr void AsciiStrCaseFold(char* p, char* end) {
  // An ASCII letter and its opposite-case counterpart differ in exactly one
  // bit, so toggling that bit with xor switches the case. Any letter pair
  // would give the same mask; 'a'/'A' is just a convenient choice.
  constexpr unsigned char kCaseMask = 'a' ^ 'A';

  // Inclusive bounds of the letters that have to be converted.
  constexpr unsigned char kRangeFirst = ToUpper ? 'a' : 'A';
  constexpr unsigned char kRangeLast = ToUpper ? 'z' : 'Z';

  while (p < end) {
    const unsigned char byte = static_cast<unsigned char>(*p);
    // Bitwise & (rather than &&) keeps the range test free of branches; the
    // operands are cast to int so that -Wbitwise-instead-of-logical stays
    // quiet.
    const bool convert = static_cast<bool>(static_cast<int>(kRangeFirst <= byte) &
                                           static_cast<int>(byte <= kRangeLast));
    *p = static_cast<char>(byte ^ (convert ? kCaseMask : 0));
    ++p;
  }
}
181 
ValidateAsciiCasefold()182 static constexpr size_t ValidateAsciiCasefold() {
183   constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
184   size_t incorrect_index = 0;
185   char lowered[num_chars] = {};
186   char uppered[num_chars] = {};
187   for (unsigned int i = 0; i < num_chars; ++i) {
188     uppered[i] = lowered[i] = static_cast<char>(i);
189   }
190   AsciiStrCaseFold<false>(&lowered[0], &lowered[num_chars]);
191   AsciiStrCaseFold<true>(&uppered[0], &uppered[num_chars]);
192   for (size_t i = 0; i < num_chars; ++i) {
193     const char ch = static_cast<char>(i),
194                ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
195                ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
196     if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
197       incorrect_index = i > 0 ? i : num_chars;
198       break;
199     }
200   }
201   return incorrect_index;
202 }
203 
204 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
205 
206 }  // namespace ascii_internal
207 
AsciiStrToLower(std::string * s)208 void AsciiStrToLower(std::string* s) {
209   char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
210   return ascii_internal::AsciiStrCaseFold<false>(p, p + s->size());
211 }
212 
AsciiStrToUpper(std::string * s)213 void AsciiStrToUpper(std::string* s) {
214   char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
215   return ascii_internal::AsciiStrCaseFold<true>(p, p + s->size());
216 }
217 
RemoveExtraAsciiWhitespace(std::string * str)218 void RemoveExtraAsciiWhitespace(std::string* str) {
219   auto stripped = StripAsciiWhitespace(*str);
220 
221   if (stripped.empty()) {
222     str->clear();
223     return;
224   }
225 
226   auto input_it = stripped.begin();
227   auto input_end = stripped.end();
228   auto output_it = &(*str)[0];
229   bool is_ws = false;
230 
231   for (; input_it < input_end; ++input_it) {
232     if (is_ws) {
233       // Consecutive whitespace?  Keep only the last.
234       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
235       if (is_ws) --output_it;
236     } else {
237       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
238     }
239 
240     *output_it = *input_it;
241     ++output_it;
242   }
243 
244   str->erase(static_cast<size_t>(output_it - &(*str)[0]));
245 }
246 
247 ABSL_NAMESPACE_END
248 }  // namespace absl
249