• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/ascii.h"
16 
17 namespace absl {
18 ABSL_NAMESPACE_BEGIN
19 namespace ascii_internal {
20 
21 // # Table generated by this Python code (bit 0x02 is currently unused):
22 // TODO(mbar) Move Python code for generation of table to BUILD and link here.
23 
24 // NOTE: The kPropertyBits table defined below was generated by Python 2
25 // code of the following form. (Bit 0x02 is currently unused and
26 // available.)
27 //
28 // def Hex2(n):
29 //   return '0x' + hex(n/16)[2:] + hex(n%16)[2:]
30 // def IsPunct(ch):
31 //   return (ord(ch) >= 32 and ord(ch) < 127 and
32 //           not ch.isspace() and not ch.isalnum())
33 // def IsBlank(ch):
34 //   return ch in ' \t'
35 // def IsCntrl(ch):
36 //   return ord(ch) < 32 or ord(ch) == 127
37 // def IsXDigit(ch):
38 //   return ch.isdigit() or ch.lower() in 'abcdef'
39 // for i in range(128):
40 //   ch = chr(i)
41 //   mask = ((ch.isalpha() and 0x01 or 0) |
42 //           (ch.isalnum() and 0x04 or 0) |
43 //           (ch.isspace() and 0x08 or 0) |
44 //           (IsPunct(ch) and 0x10 or 0) |
45 //           (IsBlank(ch) and 0x20 or 0) |
46 //           (IsCntrl(ch) and 0x40 or 0) |
47 //           (IsXDigit(ch) and 0x80 or 0))
48 //   print Hex2(mask) + ',',
49 //   if i % 16 == 7:
50 //     print ' //', Hex2(i & 0x78)
51 //   elif i % 16 == 15:
52 //     print
53 
54 // clang-format off
55 // Per-character property bitmasks, indexed by unsigned char value. Going by
56 // the generator script above, the bits are: 0x01 alpha, 0x04 alnum, 0x08
57 // space, 0x10 punct, 0x20 blank, 0x40 cntrl, 0x80 xdigit (0x02 is unused).
58 // They are intentionally left unnamed in code because their values are tied
59 // to this implementation. Entries above ASCII 127 are zero-initialized.
60 ABSL_DLL const unsigned char kPropertyBits[256] = {
61     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x00
62     0x40, 0x68, 0x48, 0x48, 0x48, 0x48, 0x40, 0x40,
63     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  // 0x10
64     0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
65     0x28, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,  // 0x20
66     0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
67     0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84, 0x84,  // 0x30
68     0x84, 0x84, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
69     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x40
70     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
71     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x50
72     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x10,
73     0x10, 0x85, 0x85, 0x85, 0x85, 0x85, 0x85, 0x05,  // 0x60
74     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
75     0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  // 0x70
76     0x05, 0x05, 0x05, 0x10, 0x10, 0x10, 0x10, 0x40,
77 };
78 
79 // Lookup table for the ascii_tolower() function, indexed by character value.
80 // Entries 'A' (0x41) through 'Z' (0x5a) hold the matching lower-case letter;
81 // every other entry, including 0x80 through 0xff, holds its own index.
82 ABSL_DLL const char kToLower[256] = {
83   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
84   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
85   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
86   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
87   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
88   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
89   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
90   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
91   '\x40',    'a',    'b',    'c',    'd',    'e',    'f',    'g',
92      'h',    'i',    'j',    'k',    'l',    'm',    'n',    'o',
93      'p',    'q',    'r',    's',    't',    'u',    'v',    'w',
94      'x',    'y',    'z', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
95   '\x60', '\x61', '\x62', '\x63', '\x64', '\x65', '\x66', '\x67',
96   '\x68', '\x69', '\x6a', '\x6b', '\x6c', '\x6d', '\x6e', '\x6f',
97   '\x70', '\x71', '\x72', '\x73', '\x74', '\x75', '\x76', '\x77',
98   '\x78', '\x79', '\x7a', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
99   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
100   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
101   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
102   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
103   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
104   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
105   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
106   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
107   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
108   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
109   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
110   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
111   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
112   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
113   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
114   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
115 };
116 
117 // Lookup table for the ascii_toupper() function, indexed by character value.
118 // Entries 'a' (0x61) through 'z' (0x7a) hold the matching upper-case letter;
119 // every other entry, including 0x80 through 0xff, holds its own index.
120 ABSL_DLL const char kToUpper[256] = {
121   '\x00', '\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07',
122   '\x08', '\x09', '\x0a', '\x0b', '\x0c', '\x0d', '\x0e', '\x0f',
123   '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17',
124   '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f',
125   '\x20', '\x21', '\x22', '\x23', '\x24', '\x25', '\x26', '\x27',
126   '\x28', '\x29', '\x2a', '\x2b', '\x2c', '\x2d', '\x2e', '\x2f',
127   '\x30', '\x31', '\x32', '\x33', '\x34', '\x35', '\x36', '\x37',
128   '\x38', '\x39', '\x3a', '\x3b', '\x3c', '\x3d', '\x3e', '\x3f',
129   '\x40', '\x41', '\x42', '\x43', '\x44', '\x45', '\x46', '\x47',
130   '\x48', '\x49', '\x4a', '\x4b', '\x4c', '\x4d', '\x4e', '\x4f',
131   '\x50', '\x51', '\x52', '\x53', '\x54', '\x55', '\x56', '\x57',
132   '\x58', '\x59', '\x5a', '\x5b', '\x5c', '\x5d', '\x5e', '\x5f',
133   '\x60',    'A',    'B',    'C',    'D',    'E',    'F',    'G',
134      'H',    'I',    'J',    'K',    'L',    'M',    'N',    'O',
135      'P',    'Q',    'R',    'S',    'T',    'U',    'V',    'W',
136      'X',    'Y',    'Z', '\x7b', '\x7c', '\x7d', '\x7e', '\x7f',
137   '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87',
138   '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f',
139   '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97',
140   '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f',
141   '\xa0', '\xa1', '\xa2', '\xa3', '\xa4', '\xa5', '\xa6', '\xa7',
142   '\xa8', '\xa9', '\xaa', '\xab', '\xac', '\xad', '\xae', '\xaf',
143   '\xb0', '\xb1', '\xb2', '\xb3', '\xb4', '\xb5', '\xb6', '\xb7',
144   '\xb8', '\xb9', '\xba', '\xbb', '\xbc', '\xbd', '\xbe', '\xbf',
145   '\xc0', '\xc1', '\xc2', '\xc3', '\xc4', '\xc5', '\xc6', '\xc7',
146   '\xc8', '\xc9', '\xca', '\xcb', '\xcc', '\xcd', '\xce', '\xcf',
147   '\xd0', '\xd1', '\xd2', '\xd3', '\xd4', '\xd5', '\xd6', '\xd7',
148   '\xd8', '\xd9', '\xda', '\xdb', '\xdc', '\xdd', '\xde', '\xdf',
149   '\xe0', '\xe1', '\xe2', '\xe3', '\xe4', '\xe5', '\xe6', '\xe7',
150   '\xe8', '\xe9', '\xea', '\xeb', '\xec', '\xed', '\xee', '\xef',
151   '\xf0', '\xf1', '\xf2', '\xf3', '\xf4', '\xf5', '\xf6', '\xf7',
152   '\xf8', '\xf9', '\xfa', '\xfb', '\xfc', '\xfd', '\xfe', '\xff',
153 };
154 // clang-format on
155 
156 }  // namespace ascii_internal
157 
AsciiStrToLower(std::string * s)158 void AsciiStrToLower(std::string* s) {
159   for (auto& ch : *s) {
160     ch = absl::ascii_tolower(ch);
161   }
162 }
163 
AsciiStrToUpper(std::string * s)164 void AsciiStrToUpper(std::string* s) {
165   for (auto& ch : *s) {
166     ch = absl::ascii_toupper(ch);
167   }
168 }
169 
RemoveExtraAsciiWhitespace(std::string * str)170 void RemoveExtraAsciiWhitespace(std::string* str) {
171   auto stripped = StripAsciiWhitespace(*str);
172 
173   if (stripped.empty()) {
174     str->clear();
175     return;
176   }
177 
178   auto input_it = stripped.begin();
179   auto input_end = stripped.end();
180   auto output_it = &(*str)[0];
181   bool is_ws = false;
182 
183   for (; input_it < input_end; ++input_it) {
184     if (is_ws) {
185       // Consecutive whitespace?  Keep only the last.
186       is_ws = absl::ascii_isspace(*input_it);
187       if (is_ws) --output_it;
188     } else {
189       is_ws = absl::ascii_isspace(*input_it);
190     }
191 
192     *output_it = *input_it;
193     ++output_it;
194   }
195 
196   str->erase(output_it - &(*str)[0]);
197 }
198 
199 ABSL_NAMESPACE_END
200 }  // namespace absl
201