1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LIB_STRINGS_SCANNER_H_ 17 #define TENSORFLOW_LIB_STRINGS_SCANNER_H_ 18 19 #include <string> 20 #include "tensorflow/core/lib/core/stringpiece.h" 21 #include "tensorflow/core/lib/strings/str_util.h" 22 #include "tensorflow/core/platform/macros.h" 23 24 namespace tensorflow { 25 namespace strings { 26 27 // Scanner provides simplified string parsing, in which a string is parsed as a 28 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then 29 // finally GetResult is called. If GetResult returns true, then it also returns 30 // the remaining characters and any captured substring. 31 // 32 // The range to capture can be controlled with RestartCapture and StopCapture; 33 // by default, all processed characters are captured. 34 class Scanner { 35 public: 36 // Classes of characters. Each enum name is to be read as the union of the 37 // parts - e.g., class LETTER_DIGIT means the class includes all letters and 38 // all digits. 39 // 40 // LETTER means ascii letter a-zA-Z. 41 // DIGIT means ascii digit: 0-9. 42 enum CharClass { 43 // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest 44 // in scanner_test.cc 45 ALL, 46 DIGIT, 47 LETTER, 48 LETTER_DIGIT, 49 LETTER_DIGIT_DASH_UNDERSCORE, 50 LETTER_DIGIT_DASH_DOT_SLASH, // SLASH is / only, not backslash 51 LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE, // SLASH is / only, not backslash 52 LETTER_DIGIT_DOT, 53 LETTER_DIGIT_DOT_PLUS_MINUS, 54 LETTER_DIGIT_DOT_UNDERSCORE, 55 LETTER_DIGIT_UNDERSCORE, 56 LOWERLETTER, 57 LOWERLETTER_DIGIT, 58 LOWERLETTER_DIGIT_UNDERSCORE, 59 NON_ZERO_DIGIT, 60 SPACE, 61 UPPERLETTER, 62 }; 63 Scanner(StringPiece source)64 explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); } 65 66 // Consume the next character of the given class from input. If the next 67 // character is not in the class, then GetResult will ultimately return false. One(CharClass clz)68 Scanner& One(CharClass clz) { 69 if (cur_.empty() || !Matches(clz, cur_[0])) { 70 return Error(); 71 } 72 cur_.remove_prefix(1); 73 return *this; 74 } 75 76 // Consume the next s.size() characters of the input, if they match <s>. If 77 // they don't match <s>, this is a no-op. ZeroOrOneLiteral(StringPiece s)78 Scanner& ZeroOrOneLiteral(StringPiece s) { 79 str_util::ConsumePrefix(&cur_, s); 80 return *this; 81 } 82 83 // Consume the next s.size() characters of the input, if they match <s>. If 84 // they don't match <s>, then GetResult will ultimately return false. OneLiteral(StringPiece s)85 Scanner& OneLiteral(StringPiece s) { 86 if (!str_util::ConsumePrefix(&cur_, s)) { 87 error_ = true; 88 } 89 return *this; 90 } 91 92 // Consume characters from the input as long as they match <clz>. Zero 93 // characters is still considered a match, so it will never cause GetResult to 94 // return false. Any(CharClass clz)95 Scanner& Any(CharClass clz) { 96 while (!cur_.empty() && Matches(clz, cur_[0])) { 97 cur_.remove_prefix(1); 98 } 99 return *this; 100 } 101 102 // Shorthand for One(clz).Any(clz). Many(CharClass clz)103 Scanner& Many(CharClass clz) { return One(clz).Any(clz); } 104 105 // Reset the capture start point. 106 // 107 // Later, when GetResult is called and if it returns true, the capture 108 // returned will start at the position at the time this was called. RestartCapture()109 Scanner& RestartCapture() { 110 capture_start_ = cur_.data(); 111 capture_end_ = nullptr; 112 return *this; 113 } 114 115 // Stop capturing input. 116 // 117 // Later, when GetResult is called and if it returns true, the capture 118 // returned will end at the position at the time this was called. StopCapture()119 Scanner& StopCapture() { 120 capture_end_ = cur_.data(); 121 return *this; 122 } 123 124 // If not at the input of input, then GetResult will ultimately return false. Eos()125 Scanner& Eos() { 126 if (!cur_.empty()) error_ = true; 127 return *this; 128 } 129 130 // Shorthand for Any(SPACE). AnySpace()131 Scanner& AnySpace() { return Any(SPACE); } 132 133 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. ScanUntil(char end_ch)134 Scanner& ScanUntil(char end_ch) { 135 ScanUntilImpl(end_ch, false); 136 return *this; 137 } 138 139 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. 140 // Backslash escape sequences are skipped. 141 // Used for implementing quoted string scanning. ScanEscapedUntil(char end_ch)142 Scanner& ScanEscapedUntil(char end_ch) { 143 ScanUntilImpl(end_ch, true); 144 return *this; 145 } 146 147 // Return the next character that will be scanned, or <default_value> if there 148 // are no more characters to scan. 149 // Note that if a scan operation has failed (so GetResult() returns false), 150 // then the value of Peek may or may not have advanced since the scan 151 // operation that failed. 152 char Peek(char default_value = '\0') const { 153 return cur_.empty() ? default_value : cur_[0]; 154 } 155 156 // Returns false if there are no remaining characters to consume. empty()157 int empty() const { return cur_.empty(); } 158 159 // Returns true if the input string successfully matched. When true is 160 // returned, the remaining string is returned in <remaining> and the captured 161 // string returned in <capture>, if non-NULL. 162 bool GetResult(StringPiece* remaining = nullptr, 163 StringPiece* capture = nullptr); 164 165 private: 166 void ScanUntilImpl(char end_ch, bool escaped); 167 Error()168 Scanner& Error() { 169 error_ = true; 170 return *this; 171 } 172 IsLetter(char ch)173 static bool IsLetter(char ch) { 174 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 175 } 176 IsLowerLetter(char ch)177 static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; } 178 IsDigit(char ch)179 static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; } 180 IsSpace(char ch)181 static bool IsSpace(char ch) { 182 return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' || 183 ch == '\r'); 184 } 185 Matches(CharClass clz,char ch)186 static bool Matches(CharClass clz, char ch) { 187 switch (clz) { 188 case ALL: 189 return true; 190 case DIGIT: 191 return IsDigit(ch); 192 case LETTER: 193 return IsLetter(ch); 194 case LETTER_DIGIT: 195 return IsLetter(ch) || IsDigit(ch); 196 case LETTER_DIGIT_DASH_UNDERSCORE: 197 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_'); 198 case LETTER_DIGIT_DASH_DOT_SLASH: 199 return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || 200 ch == '/'; 201 case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE: 202 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || 203 ch == '/' || ch == '_'); 204 case LETTER_DIGIT_DOT: 205 return IsLetter(ch) || IsDigit(ch) || ch == '.'; 206 case LETTER_DIGIT_DOT_PLUS_MINUS: 207 return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' || 208 ch == '.'; 209 case LETTER_DIGIT_DOT_UNDERSCORE: 210 return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_'; 211 case LETTER_DIGIT_UNDERSCORE: 212 return IsLetter(ch) || IsDigit(ch) || ch == '_'; 213 case LOWERLETTER: 214 return ch >= 'a' && ch <= 'z'; 215 case LOWERLETTER_DIGIT: 216 return IsLowerLetter(ch) || IsDigit(ch); 217 case LOWERLETTER_DIGIT_UNDERSCORE: 218 return IsLowerLetter(ch) || IsDigit(ch) || ch == '_'; 219 case NON_ZERO_DIGIT: 220 return IsDigit(ch) && ch != '0'; 221 case SPACE: 222 return IsSpace(ch); 223 case UPPERLETTER: 224 return ch >= 'A' && ch <= 'Z'; 225 } 226 return false; 227 } 228 229 StringPiece cur_; 230 const char* capture_start_ = nullptr; 231 const char* capture_end_ = nullptr; 232 bool error_ = false; 233 234 friend class ScannerTest; 235 TF_DISALLOW_COPY_AND_ASSIGN(Scanner); 236 }; 237 238 } // namespace strings 239 } // namespace tensorflow 240 241 #endif // TENSORFLOW_LIB_STRINGS_SCANNER_H_ 242