// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//
// Regular-expression based scanner for parsing an input stream.
33 // 34 // Example 1: parse a sequence of "var = number" entries from input: 35 // 36 // Scanner scanner(input); 37 // string var; 38 // int number; 39 // scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter 40 // while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) { 41 // ...; 42 // } 43 44 #ifndef _PCRE_SCANNER_H 45 #define _PCRE_SCANNER_H 46 47 #include <assert.h> 48 #include <string> 49 #include <vector> 50 51 #include <pcrecpp.h> 52 #include <pcre_stringpiece.h> 53 54 namespace pcrecpp { 55 56 class Scanner { 57 public: 58 Scanner(); 59 explicit Scanner(const std::string& input); 60 ~Scanner(); 61 62 // Return current line number. The returned line-number is 63 // one-based. I.e. it returns 1 + the number of consumed newlines. 64 // 65 // Note: this method may be slow. It may take time proportional to 66 // the size of the input. 67 int LineNumber() const; 68 69 // Return the byte-offset that the scanner is looking in the 70 // input data; 71 int Offset() const; 72 73 // Return true iff the start of the remaining input matches "re" 74 bool LookingAt(const RE& re) const; 75 76 // Return true iff all of the following are true 77 // a. the start of the remaining input matches "re", 78 // b. if any arguments are supplied, matched sub-patterns can be 79 // parsed and stored into the arguments. 80 // If it returns true, it skips over the matched input and any 81 // following input that matches the "skip" regular expression. 82 template<typename ... ARGS> Consume(const RE & re,ARGS &&...args)83 bool Consume(const RE& re, ARGS && ... args) { 84 const bool result = re.Consume(&input_, args...); 85 if (result && should_skip_) 86 ConsumeSkip(); 87 return result; 88 } 89 90 // Set the "skip" regular expression. If after consuming some data, 91 // a prefix of the input matches this RE, it is automatically 92 // skipped. For example, a programming language scanner would use 93 // a skip RE that matches white space and comments. 
94 // 95 // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); 96 // 97 // Skipping repeats as long as it succeeds. We used to let people do 98 // this by writing "(...)*" in the regular expression, but that added 99 // up to lots of recursive calls within the pcre library, so now we 100 // control repetition explicitly via the function call API. 101 // 102 // You can pass NULL for "re" if you do not want any data to be skipped. 103 void Skip(const char* re); // DEPRECATED; does *not* repeat 104 void SetSkipExpression(const char* re); 105 106 // Temporarily pause "skip"ing. This 107 // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() 108 // is similar to 109 // Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo"); 110 // but avoids creating/deleting new RE objects. 111 void DisableSkip(); 112 113 // Reenable previously paused skipping. Any prefix of the input 114 // that matches the skip pattern is immediately dropped. 115 void EnableSkip(); 116 117 /***** Special wrappers around SetSkip() for some common idioms *****/ 118 119 // Arranges to skip whitespace, C comments, C++ comments. 120 // The overall RE is a disjunction of the following REs: 121 // \\s whitespace 122 // //.*\n C++ comment 123 // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) 124 // We get repetition via the semantics of SetSkipExpression, not by using * SkipCXXComments()125 void SkipCXXComments() { 126 SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); 127 } 128 set_save_comments(bool comments)129 void set_save_comments(bool comments) { 130 save_comments_ = comments; 131 } 132 save_comments()133 bool save_comments() { 134 return save_comments_; 135 } 136 137 // Append to vector ranges the comments found in the 138 // byte range [start,end] (inclusive) of the input data. 139 // Only comments that were extracted entirely within that 140 // range are returned: no range splitting of atomically-extracted 141 // comments is performed. 
142 void GetComments(int start, int end, std::vector<StringPiece> *ranges); 143 144 // Append to vector ranges the comments added 145 // since the last time this was called. This 146 // functionality is provided for efficiency when 147 // interleaving scanning with parsing. 148 void GetNextComments(std::vector<StringPiece> *ranges); 149 150 private: 151 std::string data_; // All the input data 152 StringPiece input_; // Unprocessed input 153 RE* skip_; // If non-NULL, RE for skipping input 154 bool should_skip_; // If true, use skip_ 155 bool skip_repeat_; // If true, repeat skip_ as long as it works 156 bool save_comments_; // If true, aggregate the skip expression 157 158 // the skipped comments 159 // TODO: later consider requiring that the StringPieces be added 160 // in order by their start position 161 std::vector<StringPiece> *comments_; 162 163 // the offset into comments_ that has been returned by GetNextComments 164 int comments_offset_; 165 166 // helper function to consume *skip_ and honour 167 // save_comments_ 168 void ConsumeSkip(); 169 }; 170 171 } // namespace pcrecpp 172 173 #endif /* _PCRE_SCANNER_H */ 174