1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/ascii.h"
16
17 namespace absl {
18 ABSL_NAMESPACE_BEGIN
19 namespace ascii_internal {
20
// NOTE: The kPropertyBits table defined below was generated by Python 2 code
// of the following form. (Bit 0x02 is currently unused and available.)
// TODO(mbar) Move Python code for generation of table to BUILD and link here.
//
// def Hex2(n):
//   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
// def IsPunct(ch):
//   return (ord(ch) >= 32 and ord(ch) < 127 and
//           not ch.isspace() and not ch.isalnum())
// def IsBlank(ch):
//   return ch in ' \t'
// def IsCntrl(ch):
//   return ord(ch) < 32 or ord(ch) == 127
// def IsXDigit(ch):
//   return ch.isdigit() or ch.lower() in 'abcdef'
// for i in range(128):
//   ch = chr(i)
//   mask = ((ch.isalpha() and 0x01 or 0) |
//           (ch.isalnum() and 0x04 or 0) |
//           (ch.isspace() and 0x08 or 0) |
//           (IsPunct(ch) and 0x10 or 0) |
//           (IsBlank(ch) and 0x20 or 0) |
//           (IsCntrl(ch) and 0x40 or 0) |
//           (IsXDigit(ch) and 0x80 or 0))
//   print Hex2(mask) + ',',
//   if i % 16 == 7:
//     print ' //', Hex2(i & 0x78)
//   elif i % 16 == 15:
//     print
53
54 // clang-format off
// Array of bitfields holding character information. Each bit value corresponds
// to a particular character feature. For readability, and because the value
// of these bits is tightly coupled to this implementation, the individual bits
// are not named. Note that bitfields for all characters above ASCII 127 are
// zero-initialized.
//
// Bit meanings, as assigned by the Python generator script shown earlier in
// this file:
//   0x01  alphabetic            0x10  punctuation
//   0x02  (unused)              0x20  blank (' ' or '\t')
//   0x04  alphanumeric          0x40  control character
//   0x08  whitespace            0x80  hexadecimal digit
//
// Only the first 128 entries are listed explicitly (8 per row, 16 rows); the
// remaining 128 elements of the 256-element array have no initializer and are
// therefore zero.
ABSL_DLL const unsigned char kPropertyBits[256] = {
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
    0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,  // 0x08
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x18
    0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x28
    0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
    0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x38
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x48
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x58
    0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x68
    0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
    0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,  // 0x78
};
78
// Array of characters for the ascii_tolower() function. For values 'A'
// through 'Z', return the lower-case character; otherwise, return the
// identity of the passed character.
//
// The table has one entry per possible byte value (256 in total, 8 per row).
// Entry i holds the character with value i, except for entries 0x41..0x5a
// ('A'..'Z'), which hold 'a'..'z'. Bytes 0x80..0xff map to themselves, so
// non-ASCII bytes are never altered.
ABSL_DLL const char kToLower[256] = {
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  // Upper-case letters (0x41..0x5a) are replaced by their lower-case forms.
  '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
     'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
     'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
     'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
  '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
  '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
  '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
116
// Array of characters for the ascii_toupper() function. For values 'a'
// through 'z', return the upper-case character; otherwise, return the
// identity of the passed character.
//
// The table has one entry per possible byte value (256 in total, 8 per row).
// Entry i holds the character with value i, except for entries 0x61..0x7a
// ('a'..'z'), which hold 'A'..'Z'. Bytes 0x80..0xff map to themselves, so
// non-ASCII bytes are never altered.
ABSL_DLL const char kToUpper[256] = {
  '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
  '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
  '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
  '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
  '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
  '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
  '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
  '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
  '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
  '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
  '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
  '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
  // Lower-case letters (0x61..0x7a) are replaced by their upper-case forms.
  '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
     'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
     'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
     'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
  '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
  '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
  '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
  '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
  '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
  '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
  '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
  '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
  '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
  '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
  '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
  '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
  '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
  '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
  '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
  '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
};
154 // clang-format on
155
156 } // namespace ascii_internal
157
AsciiStrToLower(std::string * s)158 void AsciiStrToLower(std::string* s) {
159 for (auto& ch : *s) {
160 ch = absl::ascii_tolower(ch);
161 }
162 }
163
AsciiStrToUpper(std::string * s)164 void AsciiStrToUpper(std::string* s) {
165 for (auto& ch : *s) {
166 ch = absl::ascii_toupper(ch);
167 }
168 }
169
RemoveExtraAsciiWhitespace(std::string * str)170 void RemoveExtraAsciiWhitespace(std::string* str) {
171 auto stripped = StripAsciiWhitespace(*str);
172
173 if (stripped.empty()) {
174 str->clear();
175 return;
176 }
177
178 auto input_it = stripped.begin();
179 auto input_end = stripped.end();
180 auto output_it = &(*str)[0];
181 bool is_ws = false;
182
183 for (; input_it < input_end; ++input_it) {
184 if (is_ws) {
185 // Consecutive whitespace? Keep only the last.
186 is_ws = absl::ascii_isspace(*input_it);
187 if (is_ws) --output_it;
188 } else {
189 is_ws = absl::ascii_isspace(*input_it);
190 }
191
192 *output_it = *input_it;
193 ++output_it;
194 }
195
196 str->erase(output_it - &(*str)[0]);
197 }
198
199 ABSL_NAMESPACE_END
200 } // namespace absl
201