1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/lib/strings/str_util.h"
17
18 #include <ctype.h>
19 #include <algorithm>
20 #include <cstring>
21 #include <vector>
22 #include "tensorflow/core/lib/strings/numbers.h"
23 #include "tensorflow/core/lib/strings/stringprintf.h"
24 #include "tensorflow/core/platform/logging.h"
25
26 namespace tensorflow {
27 namespace str_util {
28
// Digit characters indexed by their numeric value. CEscape() only ever indexes
// entries 0 through 7 (it emits octal digits).
static const char hex_char[] = "0123456789abcdef";

// Returns a copy of `src` that is safe to paste into a C string literal.
//
// Newline, carriage return, tab, double quote, single quote and backslash are
// replaced by their two-character escapes. Every other byte that is >= 0x80 or
// that isprint() rejects is replaced by a backslash followed by exactly three
// octal digits. All remaining bytes are copied through unchanged.
string CEscape(StringPiece src) {
  string escaped;

  for (const unsigned char byte : src) {
    // Characters that have a dedicated two-character escape.
    const char* shorthand = nullptr;
    switch (byte) {
      case '\n':
        shorthand = "\\n";
        break;
      case '\r':
        shorthand = "\\r";
        break;
      case '\t':
        shorthand = "\\t";
        break;
      case '\"':
        shorthand = "\\\"";
        break;
      case '\'':
        shorthand = "\\'";
        break;
      case '\\':
        shorthand = "\\\\";
        break;
      default:
        break;
    }
    if (shorthand != nullptr) {
      escaped.append(shorthand);
      continue;
    }

    // Printable 7-bit characters need no escaping.
    if (byte < 0x80 && isprint(byte)) {
      escaped.push_back(byte);
      continue;
    }

    // Everything else is written as a fixed-width, three-digit octal escape.
    // Because the width is always three, a digit that follows in `src` can
    // never be mistaken for part of the escape.
    escaped.append("\\");
    escaped.push_back(hex_char[byte / 64]);
    escaped.push_back(hex_char[(byte % 64) / 8]);
    escaped.push_back(hex_char[byte % 8]);
  }

  return escaped;
}
72
73 namespace { // Private helpers for CUnescape().
74
// Returns true iff `c` is one of the characters '0' through '7'.
inline bool is_octal_digit(unsigned char c) {
  return !(c < '0' || c > '7');
}
76
// Returns true iff `c` is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
// Unlike isxdigit(), the answer does not depend on the current locale.
inline bool ascii_isxdigit(unsigned char c) {
  if ('0' <= c && c <= '9') return true;
  if ('a' <= c && c <= 'f') return true;
  return 'A' <= c && c <= 'F';
}
81
// Converts a hexadecimal digit character to its numeric value (0-15).
//
// The low nibble of '0'-'9' is already the digit's value. For letters, adding
// 9 first makes the low nibble of 'a'/'A' equal 10, 'b'/'B' equal 11, and so
// on. The result is only meaningful when `c` is a hex digit; other characters
// are not rejected.
inline int hex_digit_to_int(char c) {
  const int code = static_cast<unsigned char>(c);
  const int adjusted = (code > '9') ? code + 9 : code;
  return adjusted & 0xf;
}
89
CUnescapeInternal(StringPiece source,string * dest,string::size_type * dest_len,string * error)90 bool CUnescapeInternal(StringPiece source, string* dest,
91 string::size_type* dest_len, string* error) {
92 const char* p = source.data();
93 const char* end = source.end();
94 const char* last_byte = end - 1;
95
96 // We are going to write the result to dest with its iterator. If our string
97 // implementation uses copy-on-write, this will trigger a copy-on-write of
98 // dest's buffer; that is, dest will be assigned a new buffer.
99 //
100 // Note that the following way is NOT a legal way to modify a string's
101 // content:
102 //
103 // char* d = const_cast<char*>(dest->data());
104 //
105 // This won't trigger copy-on-write of the string, and so is dangerous when
106 // the buffer is shared.
107 auto d = dest->begin();
108
109 // Small optimization for case where source = dest and there's no escaping
110 if (source.data() == dest->data()) {
111 while (p < end && *p != '\\') {
112 p++;
113 d++;
114 }
115 }
116
117 while (p < end) {
118 if (*p != '\\') {
119 *d++ = *p++;
120 } else {
121 if (++p > last_byte) { // skip past the '\\'
122 if (error) *error = "String cannot end with \\";
123 return false;
124 }
125 switch (*p) {
126 case 'a':
127 *d++ = '\a';
128 break;
129 case 'b':
130 *d++ = '\b';
131 break;
132 case 'f':
133 *d++ = '\f';
134 break;
135 case 'n':
136 *d++ = '\n';
137 break;
138 case 'r':
139 *d++ = '\r';
140 break;
141 case 't':
142 *d++ = '\t';
143 break;
144 case 'v':
145 *d++ = '\v';
146 break;
147 case '\\':
148 *d++ = '\\';
149 break;
150 case '?':
151 *d++ = '\?';
152 break; // \? Who knew?
153 case '\'':
154 *d++ = '\'';
155 break;
156 case '"':
157 *d++ = '\"';
158 break;
159 case '0':
160 case '1':
161 case '2':
162 case '3': // octal digit: 1 to 3 digits
163 case '4':
164 case '5':
165 case '6':
166 case '7': {
167 const char* octal_start = p;
168 unsigned int ch = *p - '0';
169 if (p < last_byte && is_octal_digit(p[1])) ch = ch * 8 + *++p - '0';
170 if (p < last_byte && is_octal_digit(p[1]))
171 ch = ch * 8 + *++p - '0'; // now points at last digit
172 if (ch > 0xff) {
173 if (error) {
174 *error = "Value of \\" +
175 string(octal_start, p + 1 - octal_start) +
176 " exceeds 0xff";
177 }
178 return false;
179 }
180 *d++ = ch;
181 break;
182 }
183 case 'x':
184 case 'X': {
185 if (p >= last_byte) {
186 if (error) *error = "String cannot end with \\x";
187 return false;
188 } else if (!ascii_isxdigit(p[1])) {
189 if (error) *error = "\\x cannot be followed by a non-hex digit";
190 return false;
191 }
192 unsigned int ch = 0;
193 const char* hex_start = p;
194 while (p < last_byte && ascii_isxdigit(p[1]))
195 // Arbitrarily many hex digits
196 ch = (ch << 4) + hex_digit_to_int(*++p);
197 if (ch > 0xFF) {
198 if (error) {
199 *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
200 " exceeds 0xff";
201 }
202 return false;
203 }
204 *d++ = ch;
205 break;
206 }
207 default: {
208 if (error) *error = string("Unknown escape sequence: \\") + *p;
209 return false;
210 }
211 }
212 p++; // read past letter we escaped
213 }
214 }
215 *dest_len = d - dest->begin();
216 return true;
217 }
218
219 template <typename T>
SplitAndParseAsInts(StringPiece text,char delim,std::function<bool (StringPiece,T *)> converter,std::vector<T> * result)220 bool SplitAndParseAsInts(StringPiece text, char delim,
221 std::function<bool(StringPiece, T*)> converter,
222 std::vector<T>* result) {
223 result->clear();
224 std::vector<string> num_strings = Split(text, delim);
225 for (const auto& s : num_strings) {
226 T num;
227 if (!converter(s, &num)) return false;
228 result->push_back(num);
229 }
230 return true;
231 }
232
233 } // namespace
234
CUnescape(StringPiece source,string * dest,string * error)235 bool CUnescape(StringPiece source, string* dest, string* error) {
236 dest->resize(source.size());
237 string::size_type dest_size;
238 if (!CUnescapeInternal(source, dest, &dest_size, error)) {
239 return false;
240 }
241 dest->erase(dest_size);
242 return true;
243 }
244
// Removes, in place, every trailing character of `*s` for which isspace()
// is true.
void StripTrailingWhitespace(string* s) {
  string::size_type keep = s->size();
  // isspace() is only defined for EOF and values representable as unsigned
  // char, so convert before calling it; a plain char holding a byte >= 0x80
  // may be negative.
  while (keep > 0 &&
         isspace(static_cast<unsigned char>((*s)[keep - 1]))) {
    --keep;
  }
  s->resize(keep);
}
251
// Returns a copy of `s` in which every character has been passed through
// tolower().
string Lowercase(StringPiece s) {
  string result(s.data(), s.size());
  for (char& c : result) {
    // tolower() is only defined for EOF and values representable as unsigned
    // char; a plain char holding a byte >= 0x80 may be negative.
    c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
  }
  return result;
}
260
// Returns a copy of `s` in which every character has been passed through
// toupper().
string Uppercase(StringPiece s) {
  string result(s.data(), s.size());
  for (char& c : result) {
    // toupper() is only defined for EOF and values representable as unsigned
    // char; a plain char holding a byte >= 0x80 may be negative.
    c = static_cast<char>(toupper(static_cast<unsigned char>(c)));
  }
  return result;
}
269
ArgDefCase(StringPiece s)270 string ArgDefCase(StringPiece s) {
271 const size_t n = s.size();
272
273 // Compute the size of resulting string.
274 // Number of extra underscores we will need to add.
275 size_t extra_us = 0;
276 // Number of non-alpha chars in the beginning to skip.
277 size_t to_skip = 0;
278 for (size_t i = 0; i < n; ++i) {
279 // If we are skipping and current letter is non-alpha, skip it as well
280 if (i == to_skip && !isalpha(s[i])) {
281 ++to_skip;
282 continue;
283 }
284
285 // If we are here, we are not skipping any more.
286 // If this letter is upper case, not the very first char in the
287 // resulting string, and previous letter isn't replaced with an underscore,
288 // we will need to insert an underscore.
289 if (isupper(s[i]) && i != to_skip && i > 0 && isalnum(s[i - 1])) {
290 ++extra_us;
291 }
292 }
293
294 // Initialize result with all '_'s. There is no string
295 // constructor that does not initialize memory.
296 string result(n + extra_us - to_skip, '_');
297 // i - index into s
298 // j - index into result
299 for (size_t i = to_skip, j = 0; i < n; ++i, ++j) {
300 DCHECK_LT(j, result.size());
301 char c = s[i];
302 // If c is not alphanumeric, we don't need to do anything
303 // since there is already an underscore in its place.
304 if (isalnum(c)) {
305 if (isupper(c)) {
306 // If current char is upper case, we might need to insert an
307 // underscore.
308 if (i != to_skip) {
309 DCHECK_GT(j, 0);
310 if (result[j - 1] != '_') ++j;
311 }
312 result[j] = tolower(c);
313 } else {
314 result[j] = c;
315 }
316 }
317 }
318
319 return result;
320 }
321
// Upper-cases, in place, the first character of `*s` and every character that
// directly follows a character contained in `delimiters`. All other
// characters are left as they are.
void TitlecaseString(string* s, StringPiece delimiters) {
  bool upper = true;  // the very first character is always capitalized
  for (char& ch : *s) {
    if (upper) {
      // toupper() is only defined for EOF and values representable as
      // unsigned char; a plain char holding a byte >= 0x80 may be negative.
      ch = static_cast<char>(toupper(static_cast<unsigned char>(ch)));
    }
    // The next character is capitalized iff this one is a delimiter.
    upper = (delimiters.find(ch) != StringPiece::npos);
  }
}
331
// Returns a copy of `s` with occurrences of `oldsub` replaced by `newsub`.
//
// Only the first occurrence is replaced unless `replace_all` is true. Matches
// are searched left to right and text that was just inserted is never
// re-scanned. An empty `oldsub` matches at the beginning of the text and
// after every byte.
string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
                     bool replace_all) {
  // TODO(jlebar): We could avoid having to shift data around in the string if
  // we had a StringPiece::find() overload that searched for a StringPiece.
  string out(s);
  const size_t old_len = oldsub.size();
  const size_t new_len = newsub.size();
  // After a substitution, resume right behind the inserted text. For an empty
  // pattern, step one byte further so the search keeps moving forward.
  const size_t advance = new_len + (old_len == 0 ? 1 : 0);

  size_t at = out.find(oldsub.data(), 0, old_len);
  while (at != string::npos) {
    out.replace(at, old_len, newsub.data(), new_len);
    if (!replace_all) {
      break;
    }
    at = out.find(oldsub.data(), at + advance, old_len);
  }
  return out;
}
350
// Advances `*text` past its leading run of isspace() characters and returns
// the number of characters removed.
size_t RemoveLeadingWhitespace(StringPiece* text) {
  const char* const data = text->data();
  const size_t size = text->size();
  size_t count = 0;
  // isspace() is only defined for EOF and values representable as unsigned
  // char; a plain char holding a byte >= 0x80 may be negative.
  while (count < size && isspace(static_cast<unsigned char>(data[count]))) {
    ++count;
  }
  text->remove_prefix(count);
  return count;
}
361
// Shrinks `*text` to drop its trailing run of isspace() characters and
// returns the number of characters removed.
size_t RemoveTrailingWhitespace(StringPiece* text) {
  const char* const data = text->data();
  const size_t size = text->size();
  size_t count = 0;
  // Index from the back rather than starting at `data + size - 1`, which
  // would form a pointer before the buffer when the view is empty.
  // isspace() is only defined for EOF and values representable as unsigned
  // char, hence the conversion.
  while (count < size &&
         isspace(static_cast<unsigned char>(data[size - 1 - count]))) {
    ++count;
  }
  text->remove_suffix(count);
  return count;
}
372
RemoveWhitespaceContext(StringPiece * text)373 size_t RemoveWhitespaceContext(StringPiece* text) {
374 // use RemoveLeadingWhitespace() and RemoveTrailingWhitespace() to do the job
375 return (RemoveLeadingWhitespace(text) + RemoveTrailingWhitespace(text));
376 }
377
ConsumePrefix(StringPiece * s,StringPiece expected)378 bool ConsumePrefix(StringPiece* s, StringPiece expected) {
379 if (StartsWith(*s, expected)) {
380 s->remove_prefix(expected.size());
381 return true;
382 }
383 return false;
384 }
385
ConsumeSuffix(StringPiece * s,StringPiece expected)386 bool ConsumeSuffix(StringPiece* s, StringPiece expected) {
387 if (EndsWith(*s, expected)) {
388 s->remove_suffix(expected.size());
389 return true;
390 }
391 return false;
392 }
393
// Parses the run of decimal digits at the start of `*s`.
//
// If there is at least one digit and the value fits in a uint64, stores the
// value in `*val`, removes the digits from `*s` and returns true. Otherwise
// returns false and leaves both `*s` and `*val` untouched.
bool ConsumeLeadingDigits(StringPiece* s, uint64* val) {
  const char* const begin = s->data();
  const char* const limit = begin + s->size();
  const char* cur = begin;
  uint64 value = 0;
  for (; cur < limit; ++cur) {
    const char ch = *cur;
    if (ch < '0' || ch > '9') break;
    const uint64 next = (value * 10) + (ch - '0');
    // Without wrap-around, next >= 10 * value, so next / 8 >= value. A result
    // whose eighth is smaller than the previous value must have wrapped.
    if (next / 8 < value) {
      return false;
    }
    value = next;
  }
  if (cur == begin) {
    return false;  // no digits at all
  }
  s->remove_prefix(cur - begin);
  *val = value;
  return true;
}
418
// Splits off the leading run of non-whitespace characters of `*s`.
//
// If `*s` starts with at least one character for which isspace() is false,
// sets `*val` to that run, removes it from `*s` and returns true. Otherwise
// sets `*val` to an empty StringPiece, leaves `*s` unchanged and returns
// false.
bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) {
  const char* const begin = s->data();
  const size_t size = s->size();
  size_t n = 0;
  // isspace() is only defined for EOF and values representable as unsigned
  // char; a plain char holding a byte >= 0x80 may be negative.
  while (n < size && !isspace(static_cast<unsigned char>(begin[n]))) {
    ++n;
  }
  if (n == 0) {
    *val = StringPiece();
    return false;
  }
  *val = StringPiece(begin, n);
  s->remove_prefix(n);
  return true;
}
437
SplitAndParseAsInts(StringPiece text,char delim,std::vector<int32> * result)438 bool SplitAndParseAsInts(StringPiece text, char delim,
439 std::vector<int32>* result) {
440 return SplitAndParseAsInts<int32>(text, delim, strings::safe_strto32, result);
441 }
442
SplitAndParseAsInts(StringPiece text,char delim,std::vector<int64> * result)443 bool SplitAndParseAsInts(StringPiece text, char delim,
444 std::vector<int64>* result) {
445 return SplitAndParseAsInts<int64>(text, delim, strings::safe_strto64, result);
446 }
447
SplitAndParseAsFloats(StringPiece text,char delim,std::vector<float> * result)448 bool SplitAndParseAsFloats(StringPiece text, char delim,
449 std::vector<float>* result) {
450 return SplitAndParseAsInts<float>(text, delim,
451 [](StringPiece str, float* value) {
452 return strings::safe_strtof(str, value);
453 },
454 result);
455 }
456
// Returns the number of characters in `str` before the first '\0', looking at
// no more than `string_max_len` characters. If no terminator is found within
// that many characters, returns `string_max_len`.
size_t Strnlen(const char* str, const size_t string_max_len) {
  for (size_t len = 0; len < string_max_len; ++len) {
    if (str[len] == '\0') {
      return len;
    }
  }
  return string_max_len;
}
464
// Returns true iff `needle` occurs somewhere inside `haystack`. An empty
// `needle` is contained in every string, including the empty one.
bool StrContains(StringPiece haystack, StringPiece needle) {
  // std::search() reports an empty needle as found at haystack.begin(). For
  // an empty haystack that equals haystack.end(), so the comparison below
  // would read as "not found". Handle the empty needle up front, the same way
  // StartsWith() and EndsWith() do.
  if (needle.empty()) {
    return true;
  }
  return std::search(haystack.begin(), haystack.end(), needle.begin(),
                     needle.end()) != haystack.end();
}
469
// Returns true iff `text` begins with `prefix`. An empty `prefix` matches any
// `text`.
bool StartsWith(StringPiece text, StringPiece prefix) {
  const size_t len = prefix.size();
  if (len == 0) {
    return true;
  }
  if (text.size() < len) {
    return false;
  }
  return memcmp(text.data(), prefix.data(), len) == 0;
}
475
// Returns true iff `text` ends with `suffix`. An empty `suffix` matches any
// `text`.
bool EndsWith(StringPiece text, StringPiece suffix) {
  const size_t len = suffix.size();
  if (len == 0) {
    return true;
  }
  if (text.size() < len) {
    return false;
  }
  const char* const tail = text.data() + (text.size() - len);
  return memcmp(tail, suffix.data(), len) == 0;
}
481
482 } // namespace str_util
483 } // namespace tensorflow
484