1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_PLATFORM_SCANNER_H_ 17 #define TENSORFLOW_CORE_PLATFORM_SCANNER_H_ 18 19 #include <string> 20 21 #include "tensorflow/core/platform/macros.h" 22 #include "tensorflow/core/platform/str_util.h" 23 #include "tensorflow/core/platform/stringpiece.h" 24 25 namespace tensorflow { 26 namespace strings { 27 28 // Scanner provides simplified string parsing, in which a string is parsed as a 29 // series of scanning calls (e.g. One, Any, Many, OneLiteral, Eos), and then 30 // finally GetResult is called. If GetResult returns true, then it also returns 31 // the remaining characters and any captured substring. 32 // 33 // The range to capture can be controlled with RestartCapture and StopCapture; 34 // by default, all processed characters are captured. 35 class Scanner { 36 public: 37 // Classes of characters. Each enum name is to be read as the union of the 38 // parts - e.g., class LETTER_DIGIT means the class includes all letters and 39 // all digits. 40 // 41 // LETTER means ascii letter a-zA-Z. 42 // DIGIT means ascii digit: 0-9. 43 enum CharClass { 44 // NOTE: When adding a new CharClass, update the AllCharClasses ScannerTest 45 // in scanner_test.cc 46 ALL, 47 DIGIT, 48 LETTER, 49 LETTER_DIGIT, 50 LETTER_DIGIT_DASH_UNDERSCORE, 51 LETTER_DIGIT_DASH_DOT_SLASH, // SLASH is / only, not backslash 52 LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE, // SLASH is / only, not backslash 53 LETTER_DIGIT_DOT, 54 LETTER_DIGIT_DOT_PLUS_MINUS, 55 LETTER_DIGIT_DOT_UNDERSCORE, 56 LETTER_DIGIT_UNDERSCORE, 57 LOWERLETTER, 58 LOWERLETTER_DIGIT, 59 LOWERLETTER_DIGIT_UNDERSCORE, 60 NON_ZERO_DIGIT, 61 SPACE, 62 UPPERLETTER, 63 RANGLE, 64 }; 65 Scanner(StringPiece source)66 explicit Scanner(StringPiece source) : cur_(source) { RestartCapture(); } 67 68 // Consume the next character of the given class from input. If the next 69 // character is not in the class, then GetResult will ultimately return false. One(CharClass clz)70 Scanner& One(CharClass clz) { 71 if (cur_.empty() || !Matches(clz, cur_[0])) { 72 return Error(); 73 } 74 cur_.remove_prefix(1); 75 return *this; 76 } 77 78 // Consume the next s.size() characters of the input, if they match <s>. If 79 // they don't match <s>, this is a no-op. ZeroOrOneLiteral(StringPiece s)80 Scanner& ZeroOrOneLiteral(StringPiece s) { 81 str_util::ConsumePrefix(&cur_, s); 82 return *this; 83 } 84 85 // Consume the next s.size() characters of the input, if they match <s>. If 86 // they don't match <s>, then GetResult will ultimately return false. OneLiteral(StringPiece s)87 Scanner& OneLiteral(StringPiece s) { 88 if (!str_util::ConsumePrefix(&cur_, s)) { 89 error_ = true; 90 } 91 return *this; 92 } 93 94 // Consume characters from the input as long as they match <clz>. Zero 95 // characters is still considered a match, so it will never cause GetResult to 96 // return false. Any(CharClass clz)97 Scanner& Any(CharClass clz) { 98 while (!cur_.empty() && Matches(clz, cur_[0])) { 99 cur_.remove_prefix(1); 100 } 101 return *this; 102 } 103 104 // Shorthand for One(clz).Any(clz). Many(CharClass clz)105 Scanner& Many(CharClass clz) { return One(clz).Any(clz); } 106 107 // Reset the capture start point. 108 // 109 // Later, when GetResult is called and if it returns true, the capture 110 // returned will start at the position at the time this was called. RestartCapture()111 Scanner& RestartCapture() { 112 capture_start_ = cur_.data(); 113 capture_end_ = nullptr; 114 return *this; 115 } 116 117 // Stop capturing input. 118 // 119 // Later, when GetResult is called and if it returns true, the capture 120 // returned will end at the position at the time this was called. StopCapture()121 Scanner& StopCapture() { 122 capture_end_ = cur_.data(); 123 return *this; 124 } 125 126 // If not at the input of input, then GetResult will ultimately return false. Eos()127 Scanner& Eos() { 128 if (!cur_.empty()) error_ = true; 129 return *this; 130 } 131 132 // Shorthand for Any(SPACE). AnySpace()133 Scanner& AnySpace() { return Any(SPACE); } 134 135 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. ScanUntil(char end_ch)136 Scanner& ScanUntil(char end_ch) { 137 ScanUntilImpl(end_ch, false); 138 return *this; 139 } 140 141 // This scans input until <end_ch> is reached. <end_ch> is NOT consumed. 142 // Backslash escape sequences are skipped. 143 // Used for implementing quoted string scanning. ScanEscapedUntil(char end_ch)144 Scanner& ScanEscapedUntil(char end_ch) { 145 ScanUntilImpl(end_ch, true); 146 return *this; 147 } 148 149 // Return the next character that will be scanned, or <default_value> if there 150 // are no more characters to scan. 151 // Note that if a scan operation has failed (so GetResult() returns false), 152 // then the value of Peek may or may not have advanced since the scan 153 // operation that failed. 154 char Peek(char default_value = '\0') const { 155 return cur_.empty() ? default_value : cur_[0]; 156 } 157 158 // Returns false if there are no remaining characters to consume. empty()159 int empty() const { return cur_.empty(); } 160 161 // Returns true if the input string successfully matched. When true is 162 // returned, the remaining string is returned in <remaining> and the captured 163 // string returned in <capture>, if non-NULL. 164 bool GetResult(StringPiece* remaining = nullptr, 165 StringPiece* capture = nullptr); 166 167 private: 168 void ScanUntilImpl(char end_ch, bool escaped); 169 Error()170 Scanner& Error() { 171 error_ = true; 172 return *this; 173 } 174 IsLetter(char ch)175 static bool IsLetter(char ch) { 176 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 177 } 178 IsLowerLetter(char ch)179 static bool IsLowerLetter(char ch) { return ch >= 'a' && ch <= 'z'; } 180 IsDigit(char ch)181 static bool IsDigit(char ch) { return ch >= '0' && ch <= '9'; } 182 IsSpace(char ch)183 static bool IsSpace(char ch) { 184 return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' || 185 ch == '\r'); 186 } 187 Matches(CharClass clz,char ch)188 static bool Matches(CharClass clz, char ch) { 189 switch (clz) { 190 case ALL: 191 return true; 192 case DIGIT: 193 return IsDigit(ch); 194 case LETTER: 195 return IsLetter(ch); 196 case LETTER_DIGIT: 197 return IsLetter(ch) || IsDigit(ch); 198 case LETTER_DIGIT_DASH_UNDERSCORE: 199 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '_'); 200 case LETTER_DIGIT_DASH_DOT_SLASH: 201 return IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || 202 ch == '/'; 203 case LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE: 204 return (IsLetter(ch) || IsDigit(ch) || ch == '-' || ch == '.' || 205 ch == '/' || ch == '_'); 206 case LETTER_DIGIT_DOT: 207 return IsLetter(ch) || IsDigit(ch) || ch == '.'; 208 case LETTER_DIGIT_DOT_PLUS_MINUS: 209 return IsLetter(ch) || IsDigit(ch) || ch == '+' || ch == '-' || 210 ch == '.'; 211 case LETTER_DIGIT_DOT_UNDERSCORE: 212 return IsLetter(ch) || IsDigit(ch) || ch == '.' || ch == '_'; 213 case LETTER_DIGIT_UNDERSCORE: 214 return IsLetter(ch) || IsDigit(ch) || ch == '_'; 215 case LOWERLETTER: 216 return ch >= 'a' && ch <= 'z'; 217 case LOWERLETTER_DIGIT: 218 return IsLowerLetter(ch) || IsDigit(ch); 219 case LOWERLETTER_DIGIT_UNDERSCORE: 220 return IsLowerLetter(ch) || IsDigit(ch) || ch == '_'; 221 case NON_ZERO_DIGIT: 222 return IsDigit(ch) && ch != '0'; 223 case SPACE: 224 return IsSpace(ch); 225 case UPPERLETTER: 226 return ch >= 'A' && ch <= 'Z'; 227 case RANGLE: 228 return ch == '>'; 229 } 230 return false; 231 } 232 233 StringPiece cur_; 234 const char* capture_start_ = nullptr; 235 const char* capture_end_ = nullptr; 236 bool error_ = false; 237 238 friend class ScannerTest; 239 TF_DISALLOW_COPY_AND_ASSIGN(Scanner); 240 }; 241 242 } // namespace strings 243 } // namespace tensorflow 244 245 #endif // TENSORFLOW_CORE_PLATFORM_SCANNER_H_ 246