• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 #include <climits>
18 #include <cstring>
19 #include <string>
20 
21 #include "absl/base/config.h"
22 
23 namespace absl {
24 ABSL_NAMESPACE_BEGIN
25 namespace ascii_internal {
26 
27 // # Table generated by this Python code (bit 0x02 is currently unused):
28 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
29 
30 // NOTE: The kAsciiPropertyBits table used within this code was generated by
31 // Python code of the following form. (Bit 0x02 is currently unused and
32 // available.)
33 //
34 // def Hex2(n):
35 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
36 // def IsPunct(ch):
37 //   return (ord(ch) >= 32 and ord(ch) < 127 and
38 //           not ch.isspace() and not ch.isalnum())
39 // def IsBlank(ch):
40 //   return ch in ' \t'
41 // def IsCntrl(ch):
42 //   return ord(ch) < 32 or ord(ch) == 127
43 // def IsXDigit(ch):
44 //   return ch.isdigit() or ch.lower() in 'abcdef'
45 // for i in range(128):
46 //   ch = chr(i)
47 //   mask = ((ch.isalpha() and 0x01 or 0) |
48 //           (ch.isalnum() and 0x04 or 0) |
49 //           (ch.isspace() and 0x08 or 0) |
50 //           (IsPunct(ch) and 0x10 or 0) |
51 //           (IsBlank(ch) and 0x20 or 0) |
52 //           (IsCntrl(ch) and 0x40 or 0) |
53 //           (IsXDigit(ch) and 0x80 or 0))
54 //   print Hex2(mask) + ',',
55 //   if i % 16 == 7:
56 //     print ' //', Hex2(i & 0x78)
57 //   elif i % 16 == 15:
58 //     print
59 
60 // clang-format off
61 // Array of bitfields holding character information. Each bit value corresponds
62 // to a particular character feature. For readability, and because the value
63 // of these bits is tightly coupled to this implementation, the individual bits
64 // are not named. Note that bitfields for all characters above ASCII 127 are
65 // zero-initialized.
66 ABSL_DLL const unsigned char kPropertyBits[256] = {
67     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
68     0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
69     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
70     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
71     0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
72     0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
73     0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
74     0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
75     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
76     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
77     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
78     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
79     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
80     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
81     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
82     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
83 };
84 
85 // Array of characters for the ascii_tolower() function. For values 'A'
86 // through 'Z', return the lower-case character; otherwise, return the
87 // identity of the passed character.
88 ABSL_DLL const char kToLower[256] = {
89   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
90   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
91   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
92   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
93   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
94   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
95   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
96   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
97   '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
98      'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
99      'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
100      'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
101   '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
102   '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
103   '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
104   '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
105   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
106   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
107   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
108   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
109   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
110   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
111   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
112   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
113   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
114   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
115   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
116   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
117   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
118   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
119   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
120   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
121 };
122 
123 // Array of characters for the ascii_toupper() function. For values 'a'
124 // through 'z', return the upper-case character; otherwise, return the
125 // identity of the passed character.
126 ABSL_DLL const char kToUpper[256] = {
127   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
128   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
129   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
130   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
131   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
132   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
133   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
134   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
135   '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
136   '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
137   '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
138   '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
139   '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
140      'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
141      'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
142      'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
143   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
144   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
145   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
146   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
147   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
148   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
149   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
150   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
151   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
152   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
153   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
154   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
155   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
156   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
157   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
158   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
159 };
160 // clang-format on
161 
162 // Returns whether `c` is in the a-z/A-Z range (w.r.t. `ToUpper`).
163 // Implemented by:
164 //  1. Pushing the a-z/A-Z range to [SCHAR_MIN, SCHAR_MIN + 26).
165 //  2. Comparing to SCHAR_MIN + 26.
166 template <bool ToUpper>
AsciiInAZRange(unsigned char c)167 constexpr bool AsciiInAZRange(unsigned char c) {
168   constexpr unsigned char sub = (ToUpper ? 'a' : 'A') - SCHAR_MIN;
169   constexpr signed char threshold = SCHAR_MIN + 26;  // 26 = alphabet size.
170   // Using unsigned arithmetic as overflows/underflows are well defined.
171   unsigned char u = c - sub;
172   // Using signed cmp, as SIMD unsigned cmp isn't available in many platforms.
173   return static_cast<signed char>(u) < threshold;
174 }
175 
176 template <bool ToUpper>
AsciiStrCaseFold(char * p,char * end)177 constexpr void AsciiStrCaseFold(char* p, char* end) {
178   // The upper- and lowercase versions of ASCII characters differ by only 1 bit.
179   // When we need to flip the case, we can xor with this bit to achieve the
180   // desired result. Note that the choice of 'a' and 'A' here is arbitrary. We
181   // could have chosen 'z' and 'Z', or any other pair of characters as they all
182   // have the same single bit difference.
183   constexpr unsigned char kAsciiCaseBitFlip = 'a' ^ 'A';
184 
185   for (; p < end; ++p) {
186     unsigned char v = static_cast<unsigned char>(*p);
187     v ^= AsciiInAZRange<ToUpper>(v) ? kAsciiCaseBitFlip : 0;
188     *p = static_cast<char>(v);
189   }
190 }
191 
ValidateAsciiCasefold()192 static constexpr size_t ValidateAsciiCasefold() {
193   constexpr size_t num_chars = 1 + CHAR_MAX - CHAR_MIN;
194   size_t incorrect_index = 0;
195   char lowered[num_chars] = {};
196   char uppered[num_chars] = {};
197   for (unsigned int i = 0; i < num_chars; ++i) {
198     uppered[i] = lowered[i] = static_cast<char>(i);
199   }
200   AsciiStrCaseFold<false>(&lowered[0], &lowered[num_chars]);
201   AsciiStrCaseFold<true>(&uppered[0], &uppered[num_chars]);
202   for (size_t i = 0; i < num_chars; ++i) {
203     const char ch = static_cast<char>(i),
204                ch_upper = ('a' <= ch && ch <= 'z' ? 'A' + (ch - 'a') : ch),
205                ch_lower = ('A' <= ch && ch <= 'Z' ? 'a' + (ch - 'A') : ch);
206     if (uppered[i] != ch_upper || lowered[i] != ch_lower) {
207       incorrect_index = i > 0 ? i : num_chars;
208       break;
209     }
210   }
211   return incorrect_index;
212 }
213 
214 static_assert(ValidateAsciiCasefold() == 0, "error in case conversion");
215 
216 }  // namespace ascii_internal
217 
AsciiStrToLower(std::string * s)218 void AsciiStrToLower(std::string* s) {
219   char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
220   return ascii_internal::AsciiStrCaseFold<false>(p, p + s->size());
221 }
222 
AsciiStrToUpper(std::string * s)223 void AsciiStrToUpper(std::string* s) {
224   char* p = &(*s)[0];  // Guaranteed to be valid for empty strings
225   return ascii_internal::AsciiStrCaseFold<true>(p, p + s->size());
226 }
227 
RemoveExtraAsciiWhitespace(std::string * str)228 void RemoveExtraAsciiWhitespace(std::string* str) {
229   auto stripped = StripAsciiWhitespace(*str);
230 
231   if (stripped.empty()) {
232     str->clear();
233     return;
234   }
235 
236   auto input_it = stripped.begin();
237   auto input_end = stripped.end();
238   auto output_it = &(*str)[0];
239   bool is_ws = false;
240 
241   for (; input_it < input_end; ++input_it) {
242     if (is_ws) {
243       // Consecutive whitespace?  Keep only the last.
244       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
245       if (is_ws) --output_it;
246     } else {
247       is_ws = absl::ascii_isspace(static_cast<unsigned char>(*input_it));
248     }
249 
250     *output_it = *input_it;
251     ++output_it;
252   }
253 
254   str->erase(static_cast<size_t>(output_it - &(*str)[0]));
255 }
256 
257 ABSL_NAMESPACE_END
258 }  // namespace absl
259