• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stddef.h>
6 #include <sys/types.h>
7 
8 #include <memory>
9 #include <stdexcept>
10 #include <string>
11 #include <tuple>
12 #include <utility>
13 #include <vector>
14 
15 #include "absl/strings/string_view.h"
16 #include "pybind11/buffer_info.h"
17 #include "pybind11/gil.h"
18 #include "pybind11/pybind11.h"
19 #include "pybind11/pytypes.h"
20 #include "pybind11/stl.h"  // IWYU pragma: keep
21 #include "re2/filtered_re2.h"
22 #include "re2/re2.h"
23 #include "re2/set.h"
24 
25 #ifdef _WIN32
26 #include <basetsd.h>
27 #define ssize_t SSIZE_T
28 #endif
29 
30 namespace re2_python {
31 
32 // This is conventional.
33 namespace py = pybind11;
34 
35 // In terms of the pybind11 API, a py::buffer is merely a py::object that
36 // supports the buffer interface/protocol and you must explicitly request
37 // a py::buffer_info in order to access the actual bytes. Under the hood,
38 // the py::buffer_info manages a reference count to the py::buffer, so it
39 // must be constructed and subsequently destructed while holding the GIL.
FromBytes(const py::buffer_info & bytes)40 static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
41   char* data = reinterpret_cast<char*>(bytes.ptr);
42   ssize_t size = bytes.size;
43   return absl::string_view(data, size);
44 }
45 
// Returns the length in bytes (1 to 4) of the character that starts at
// `ptr`, decided solely by the high four bits of the first byte. Lead bytes
// below 0xC0 count as a single byte.
static inline int OneCharLen(const char* ptr) {
  const unsigned int lead = static_cast<unsigned char>(*ptr);
  if (lead < 0xC0) {
    return 1;
  }
  if (lead < 0xE0) {
    return 2;
  }
  if (lead < 0xF0) {
    return 3;
  }
  return 4;
}
49 
50 // Helper function for when Python encodes str to bytes and then needs to
51 // convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
CharLenToBytes(py::buffer buffer,ssize_t pos,ssize_t len)52 ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
53   auto bytes = buffer.request();
54   auto text = FromBytes(bytes);
55   auto ptr = text.data() + pos;
56   auto end = text.data() + text.size();
57   while (ptr < end && len > 0) {
58     ptr += OneCharLen(ptr);
59     --len;
60   }
61   return ptr - (text.data() + pos);
62 }
63 
64 // Helper function for when Python decodes bytes to str and then needs to
65 // convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
BytesToCharLen(py::buffer buffer,ssize_t pos,ssize_t endpos)66 ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
67   auto bytes = buffer.request();
68   auto text = FromBytes(bytes);
69   auto ptr = text.data() + pos;
70   auto end = text.data() + endpos;
71   ssize_t len = 0;
72   while (ptr < end) {
73     ptr += OneCharLen(ptr);
74     ++len;
75   }
76   return len;
77 }
78 
RE2InitShim(py::buffer buffer,const RE2::Options & options)79 std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
80                                  const RE2::Options& options) {
81   auto bytes = buffer.request();
82   auto pattern = FromBytes(bytes);
83   return std::make_unique<RE2>(pattern, options);
84 }
85 
RE2ErrorShim(const RE2 & self)86 py::bytes RE2ErrorShim(const RE2& self) {
87   // Return std::string as bytes. That is, without decoding to str.
88   return self.error();
89 }
90 
RE2NamedCapturingGroupsShim(const RE2 & self)91 std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
92     const RE2& self) {
93   const int num_groups = self.NumberOfCapturingGroups();
94   std::vector<std::pair<py::bytes, int>> groups;
95   groups.reserve(num_groups);
96   for (const auto& it : self.NamedCapturingGroups()) {
97     groups.emplace_back(it.first, it.second);
98   }
99   return groups;
100 }
101 
RE2ProgramFanoutShim(const RE2 & self)102 std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
103   std::vector<int> histogram;
104   self.ProgramFanout(&histogram);
105   return histogram;
106 }
107 
RE2ReverseProgramFanoutShim(const RE2 & self)108 std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
109   std::vector<int> histogram;
110   self.ReverseProgramFanout(&histogram);
111   return histogram;
112 }
113 
RE2PossibleMatchRangeShim(const RE2 & self,int maxlen)114 std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
115     const RE2& self, int maxlen) {
116   std::string min, max;
117   // Return std::string as bytes. That is, without decoding to str.
118   return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
119 }
120 
RE2MatchShim(const RE2 & self,RE2::Anchor anchor,py::buffer buffer,ssize_t pos,ssize_t endpos)121 std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
122                                                       RE2::Anchor anchor,
123                                                       py::buffer buffer,
124                                                       ssize_t pos,
125                                                       ssize_t endpos) {
126   auto bytes = buffer.request();
127   auto text = FromBytes(bytes);
128   const int num_groups = self.NumberOfCapturingGroups() + 1;  // need $0
129   std::vector<absl::string_view> groups;
130   groups.resize(num_groups);
131   py::gil_scoped_release release_gil;
132   if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
133     // Ensure that groups are null before converting to spans!
134     for (auto& it : groups) {
135       it = absl::string_view();
136     }
137   }
138   std::vector<std::pair<ssize_t, ssize_t>> spans;
139   spans.reserve(num_groups);
140   for (const auto& it : groups) {
141     if (it.data() == NULL) {
142       spans.emplace_back(-1, -1);
143     } else {
144       spans.emplace_back(it.data() - text.data(),
145                          it.data() - text.data() + it.size());
146     }
147   }
148   return spans;
149 }
150 
RE2QuoteMetaShim(py::buffer buffer)151 py::bytes RE2QuoteMetaShim(py::buffer buffer) {
152   auto bytes = buffer.request();
153   auto pattern = FromBytes(bytes);
154   // Return std::string as bytes. That is, without decoding to str.
155   return RE2::QuoteMeta(pattern);
156 }
157 
158 class Set {
159  public:
Set(RE2::Anchor anchor,const RE2::Options & options)160   Set(RE2::Anchor anchor, const RE2::Options& options)
161       : set_(options, anchor) {}
162 
163   ~Set() = default;
164 
165   // Not copyable or movable.
166   Set(const Set&) = delete;
167   Set& operator=(const Set&) = delete;
168 
Add(py::buffer buffer)169   int Add(py::buffer buffer) {
170     auto bytes = buffer.request();
171     auto pattern = FromBytes(bytes);
172     int index = set_.Add(pattern, /*error=*/NULL);  // -1 on error
173     return index;
174   }
175 
Compile()176   bool Compile() {
177     // Compiling can fail.
178     return set_.Compile();
179   }
180 
Match(py::buffer buffer) const181   std::vector<int> Match(py::buffer buffer) const {
182     auto bytes = buffer.request();
183     auto text = FromBytes(bytes);
184     std::vector<int> matches;
185     py::gil_scoped_release release_gil;
186     set_.Match(text, &matches);
187     return matches;
188   }
189 
190  private:
191   RE2::Set set_;
192 };
193 
194 class Filter {
195  public:
196   Filter() = default;
197   ~Filter() = default;
198 
199   // Not copyable or movable.
200   Filter(const Filter&) = delete;
201   Filter& operator=(const Filter&) = delete;
202 
Add(py::buffer buffer,const RE2::Options & options)203   int Add(py::buffer buffer, const RE2::Options& options) {
204     auto bytes = buffer.request();
205     auto pattern = FromBytes(bytes);
206     int index = -1;  // not clobbered on error
207     filter_.Add(pattern, options, &index);
208     return index;
209   }
210 
Compile()211   bool Compile() {
212     std::vector<std::string> atoms;
213     filter_.Compile(&atoms);
214     RE2::Options options;
215     options.set_literal(true);
216     options.set_case_sensitive(false);
217     set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
218     for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
219       if (set_->Add(atoms[i], /*error=*/NULL) != i) {
220         // Should never happen: the atom is a literal!
221         py::pybind11_fail("set_->Add() failed");
222       }
223     }
224     // Compiling can fail.
225     return set_->Compile();
226   }
227 
Match(py::buffer buffer,bool potential) const228   std::vector<int> Match(py::buffer buffer, bool potential) const {
229     if (set_ == nullptr) {
230       py::pybind11_fail("Match() called before compiling");
231     }
232 
233     auto bytes = buffer.request();
234     auto text = FromBytes(bytes);
235     std::vector<int> atoms;
236     py::gil_scoped_release release_gil;
237     set_->Match(text, &atoms);
238     std::vector<int> matches;
239     if (potential) {
240       filter_.AllPotentials(atoms, &matches);
241     } else {
242       filter_.AllMatches(text, atoms, &matches);
243     }
244     return matches;
245   }
246 
GetRE2(int index) const247   const RE2& GetRE2(int index) const {
248     return filter_.GetRE2(index);
249   }
250 
251  private:
252   re2::FilteredRE2 filter_;
253   std::unique_ptr<RE2::Set> set_;
254 };
255 
PYBIND11_MODULE(_re2,module)256 PYBIND11_MODULE(_re2, module) {
257   // Translate exceptions thrown by py::pybind11_fail() into Python.
258   py::register_local_exception<std::runtime_error>(module, "Error");
259 
260   module.def("CharLenToBytes", &CharLenToBytes);
261   module.def("BytesToCharLen", &BytesToCharLen);
262 
263   // CLASSES
264   //     class RE2
265   //         enum Anchor
266   //         class Options
267   //             enum Encoding
268   //     class Set
269   //     class Filter
270   py::class_<RE2> re2(module, "RE2");
271   py::enum_<RE2::Anchor> anchor(re2, "Anchor");
272   py::class_<RE2::Options> options(re2, "Options");
273   py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
274   py::class_<Set> set(module, "Set");
275   py::class_<Filter> filter(module, "Filter");
276 
277   anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED);
278   anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START);
279   anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);
280 
281   encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8);
282   encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1);
283 
284   options.def(py::init<>())
285       .def_property("max_mem",                          //
286                     &RE2::Options::max_mem,             //
287                     &RE2::Options::set_max_mem)         //
288       .def_property("encoding",                         //
289                     &RE2::Options::encoding,            //
290                     &RE2::Options::set_encoding)        //
291       .def_property("posix_syntax",                     //
292                     &RE2::Options::posix_syntax,        //
293                     &RE2::Options::set_posix_syntax)    //
294       .def_property("longest_match",                    //
295                     &RE2::Options::longest_match,       //
296                     &RE2::Options::set_longest_match)   //
297       .def_property("log_errors",                       //
298                     &RE2::Options::log_errors,          //
299                     &RE2::Options::set_log_errors)      //
300       .def_property("literal",                          //
301                     &RE2::Options::literal,             //
302                     &RE2::Options::set_literal)         //
303       .def_property("never_nl",                         //
304                     &RE2::Options::never_nl,            //
305                     &RE2::Options::set_never_nl)        //
306       .def_property("dot_nl",                           //
307                     &RE2::Options::dot_nl,              //
308                     &RE2::Options::set_dot_nl)          //
309       .def_property("never_capture",                    //
310                     &RE2::Options::never_capture,       //
311                     &RE2::Options::set_never_capture)   //
312       .def_property("case_sensitive",                   //
313                     &RE2::Options::case_sensitive,      //
314                     &RE2::Options::set_case_sensitive)  //
315       .def_property("perl_classes",                     //
316                     &RE2::Options::perl_classes,        //
317                     &RE2::Options::set_perl_classes)    //
318       .def_property("word_boundary",                    //
319                     &RE2::Options::word_boundary,       //
320                     &RE2::Options::set_word_boundary)   //
321       .def_property("one_line",                         //
322                     &RE2::Options::one_line,            //
323                     &RE2::Options::set_one_line);       //
324 
325   re2.def(py::init(&RE2InitShim))
326       .def("ok", &RE2::ok)
327       .def("error", &RE2ErrorShim)
328       .def("options", &RE2::options)
329       .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups)
330       .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim)
331       .def("ProgramSize", &RE2::ProgramSize)
332       .def("ReverseProgramSize", &RE2::ReverseProgramSize)
333       .def("ProgramFanout", &RE2ProgramFanoutShim)
334       .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim)
335       .def("PossibleMatchRange", &RE2PossibleMatchRangeShim)
336       .def("Match", &RE2MatchShim)
337       .def_static("QuoteMeta", &RE2QuoteMetaShim);
338 
339   set.def(py::init<RE2::Anchor, const RE2::Options&>())
340       .def("Add", &Set::Add)
341       .def("Compile", &Set::Compile)
342       .def("Match", &Set::Match);
343 
344   filter.def(py::init<>())
345       .def("Add", &Filter::Add)
346       .def("Compile", &Filter::Compile)
347       .def("Match", &Filter::Match)
348       .def("GetRE2", &Filter::GetRE2,
349            py::return_value_policy::reference_internal);
350 }
351 
352 }  // namespace re2_python
353