1 // Copyright 2019 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include <stddef.h>
6 #include <sys/types.h>
7
8 #include <memory>
9 #include <stdexcept>
10 #include <string>
11 #include <tuple>
12 #include <utility>
13 #include <vector>
14
15 #include "absl/strings/string_view.h"
16 #include "pybind11/buffer_info.h"
17 #include "pybind11/gil.h"
18 #include "pybind11/pybind11.h"
19 #include "pybind11/pytypes.h"
20 #include "pybind11/stl.h" // IWYU pragma: keep
21 #include "re2/filtered_re2.h"
22 #include "re2/re2.h"
23 #include "re2/set.h"
24
25 #ifdef _WIN32
26 #include <basetsd.h>
27 #define ssize_t SSIZE_T
28 #endif
29
30 namespace re2_python {
31
32 // This is conventional.
33 namespace py = pybind11;
34
35 // In terms of the pybind11 API, a py::buffer is merely a py::object that
36 // supports the buffer interface/protocol and you must explicitly request
37 // a py::buffer_info in order to access the actual bytes. Under the hood,
38 // the py::buffer_info manages a reference count to the py::buffer, so it
39 // must be constructed and subsequently destructed while holding the GIL.
FromBytes(const py::buffer_info & bytes)40 static inline absl::string_view FromBytes(const py::buffer_info& bytes) {
41 char* data = reinterpret_cast<char*>(bytes.ptr);
42 ssize_t size = bytes.size;
43 return absl::string_view(data, size);
44 }
45
// Returns the length in bytes (1 to 4) of the UTF-8 sequence whose first byte
// is `*ptr`. The length is looked up from the high nibble of that byte; bytes
// whose high nibble is below 0xC (ASCII and continuation bytes) map to 1.
static inline int OneCharLen(const char* ptr) {
  static constexpr char kLenByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                                1, 1, 1, 1, 2, 2, 3, 4};
  const unsigned char lead = static_cast<unsigned char>(*ptr);
  return kLenByHighNibble[lead >> 4];
}
49
50 // Helper function for when Python encodes str to bytes and then needs to
51 // convert str offsets to bytes offsets. Assumes that text is valid UTF-8.
CharLenToBytes(py::buffer buffer,ssize_t pos,ssize_t len)52 ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) {
53 auto bytes = buffer.request();
54 auto text = FromBytes(bytes);
55 auto ptr = text.data() + pos;
56 auto end = text.data() + text.size();
57 while (ptr < end && len > 0) {
58 ptr += OneCharLen(ptr);
59 --len;
60 }
61 return ptr - (text.data() + pos);
62 }
63
64 // Helper function for when Python decodes bytes to str and then needs to
65 // convert bytes offsets to str offsets. Assumes that text is valid UTF-8.
BytesToCharLen(py::buffer buffer,ssize_t pos,ssize_t endpos)66 ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) {
67 auto bytes = buffer.request();
68 auto text = FromBytes(bytes);
69 auto ptr = text.data() + pos;
70 auto end = text.data() + endpos;
71 ssize_t len = 0;
72 while (ptr < end) {
73 ptr += OneCharLen(ptr);
74 ++len;
75 }
76 return len;
77 }
78
RE2InitShim(py::buffer buffer,const RE2::Options & options)79 std::unique_ptr<RE2> RE2InitShim(py::buffer buffer,
80 const RE2::Options& options) {
81 auto bytes = buffer.request();
82 auto pattern = FromBytes(bytes);
83 return std::make_unique<RE2>(pattern, options);
84 }
85
RE2ErrorShim(const RE2 & self)86 py::bytes RE2ErrorShim(const RE2& self) {
87 // Return std::string as bytes. That is, without decoding to str.
88 return self.error();
89 }
90
RE2NamedCapturingGroupsShim(const RE2 & self)91 std::vector<std::pair<py::bytes, int>> RE2NamedCapturingGroupsShim(
92 const RE2& self) {
93 const int num_groups = self.NumberOfCapturingGroups();
94 std::vector<std::pair<py::bytes, int>> groups;
95 groups.reserve(num_groups);
96 for (const auto& it : self.NamedCapturingGroups()) {
97 groups.emplace_back(it.first, it.second);
98 }
99 return groups;
100 }
101
RE2ProgramFanoutShim(const RE2 & self)102 std::vector<int> RE2ProgramFanoutShim(const RE2& self) {
103 std::vector<int> histogram;
104 self.ProgramFanout(&histogram);
105 return histogram;
106 }
107
RE2ReverseProgramFanoutShim(const RE2 & self)108 std::vector<int> RE2ReverseProgramFanoutShim(const RE2& self) {
109 std::vector<int> histogram;
110 self.ReverseProgramFanout(&histogram);
111 return histogram;
112 }
113
RE2PossibleMatchRangeShim(const RE2 & self,int maxlen)114 std::tuple<bool, py::bytes, py::bytes> RE2PossibleMatchRangeShim(
115 const RE2& self, int maxlen) {
116 std::string min, max;
117 // Return std::string as bytes. That is, without decoding to str.
118 return {self.PossibleMatchRange(&min, &max, maxlen), min, max};
119 }
120
RE2MatchShim(const RE2 & self,RE2::Anchor anchor,py::buffer buffer,ssize_t pos,ssize_t endpos)121 std::vector<std::pair<ssize_t, ssize_t>> RE2MatchShim(const RE2& self,
122 RE2::Anchor anchor,
123 py::buffer buffer,
124 ssize_t pos,
125 ssize_t endpos) {
126 auto bytes = buffer.request();
127 auto text = FromBytes(bytes);
128 const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0
129 std::vector<absl::string_view> groups;
130 groups.resize(num_groups);
131 py::gil_scoped_release release_gil;
132 if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) {
133 // Ensure that groups are null before converting to spans!
134 for (auto& it : groups) {
135 it = absl::string_view();
136 }
137 }
138 std::vector<std::pair<ssize_t, ssize_t>> spans;
139 spans.reserve(num_groups);
140 for (const auto& it : groups) {
141 if (it.data() == NULL) {
142 spans.emplace_back(-1, -1);
143 } else {
144 spans.emplace_back(it.data() - text.data(),
145 it.data() - text.data() + it.size());
146 }
147 }
148 return spans;
149 }
150
RE2QuoteMetaShim(py::buffer buffer)151 py::bytes RE2QuoteMetaShim(py::buffer buffer) {
152 auto bytes = buffer.request();
153 auto pattern = FromBytes(bytes);
154 // Return std::string as bytes. That is, without decoding to str.
155 return RE2::QuoteMeta(pattern);
156 }
157
158 class Set {
159 public:
Set(RE2::Anchor anchor,const RE2::Options & options)160 Set(RE2::Anchor anchor, const RE2::Options& options)
161 : set_(options, anchor) {}
162
163 ~Set() = default;
164
165 // Not copyable or movable.
166 Set(const Set&) = delete;
167 Set& operator=(const Set&) = delete;
168
Add(py::buffer buffer)169 int Add(py::buffer buffer) {
170 auto bytes = buffer.request();
171 auto pattern = FromBytes(bytes);
172 int index = set_.Add(pattern, /*error=*/NULL); // -1 on error
173 return index;
174 }
175
Compile()176 bool Compile() {
177 // Compiling can fail.
178 return set_.Compile();
179 }
180
Match(py::buffer buffer) const181 std::vector<int> Match(py::buffer buffer) const {
182 auto bytes = buffer.request();
183 auto text = FromBytes(bytes);
184 std::vector<int> matches;
185 py::gil_scoped_release release_gil;
186 set_.Match(text, &matches);
187 return matches;
188 }
189
190 private:
191 RE2::Set set_;
192 };
193
194 class Filter {
195 public:
196 Filter() = default;
197 ~Filter() = default;
198
199 // Not copyable or movable.
200 Filter(const Filter&) = delete;
201 Filter& operator=(const Filter&) = delete;
202
Add(py::buffer buffer,const RE2::Options & options)203 int Add(py::buffer buffer, const RE2::Options& options) {
204 auto bytes = buffer.request();
205 auto pattern = FromBytes(bytes);
206 int index = -1; // not clobbered on error
207 filter_.Add(pattern, options, &index);
208 return index;
209 }
210
Compile()211 bool Compile() {
212 std::vector<std::string> atoms;
213 filter_.Compile(&atoms);
214 RE2::Options options;
215 options.set_literal(true);
216 options.set_case_sensitive(false);
217 set_ = std::make_unique<RE2::Set>(options, RE2::UNANCHORED);
218 for (int i = 0; i < static_cast<int>(atoms.size()); ++i) {
219 if (set_->Add(atoms[i], /*error=*/NULL) != i) {
220 // Should never happen: the atom is a literal!
221 py::pybind11_fail("set_->Add() failed");
222 }
223 }
224 // Compiling can fail.
225 return set_->Compile();
226 }
227
Match(py::buffer buffer,bool potential) const228 std::vector<int> Match(py::buffer buffer, bool potential) const {
229 if (set_ == nullptr) {
230 py::pybind11_fail("Match() called before compiling");
231 }
232
233 auto bytes = buffer.request();
234 auto text = FromBytes(bytes);
235 std::vector<int> atoms;
236 py::gil_scoped_release release_gil;
237 set_->Match(text, &atoms);
238 std::vector<int> matches;
239 if (potential) {
240 filter_.AllPotentials(atoms, &matches);
241 } else {
242 filter_.AllMatches(text, atoms, &matches);
243 }
244 return matches;
245 }
246
GetRE2(int index) const247 const RE2& GetRE2(int index) const {
248 return filter_.GetRE2(index);
249 }
250
251 private:
252 re2::FilteredRE2 filter_;
253 std::unique_ptr<RE2::Set> set_;
254 };
255
// Defines the `_re2` extension module: the two offset helpers plus the RE2,
// RE2.Anchor, RE2.Options, RE2.Options.Encoding, Set and Filter types.
PYBIND11_MODULE(_re2, module) {
  // Translate exceptions thrown by py::pybind11_fail() into Python.
  py::register_local_exception<std::runtime_error>(module, "Error");

  module.def("CharLenToBytes", &CharLenToBytes);
  module.def("BytesToCharLen", &BytesToCharLen);

  // All of the types are declared up front, before any of their members are
  // defined:
  //
  //   class RE2
  //     enum Anchor
  //     class Options
  //       enum Encoding
  //   class Set
  //   class Filter
  py::class_<RE2> re2(module, "RE2");
  py::enum_<RE2::Anchor> anchor(re2, "Anchor");
  py::class_<RE2::Options> options(re2, "Options");
  py::enum_<RE2::Options::Encoding> encoding(options, "Encoding");
  py::class_<Set> set(module, "Set");
  py::class_<Filter> filter(module, "Filter");

  // RE2.Anchor
  anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED)
      .value("ANCHOR_START", RE2::Anchor::ANCHOR_START)
      .value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH);

  // RE2.Options.Encoding
  encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8)
      .value("LATIN1", RE2::Options::Encoding::EncodingLatin1);

  // RE2.Options: each property pairs a getter with its set_* counterpart.
  options.def(py::init<>());
  options.def_property("max_mem", &RE2::Options::max_mem,
                       &RE2::Options::set_max_mem);
  options.def_property("encoding", &RE2::Options::encoding,
                       &RE2::Options::set_encoding);
  options.def_property("posix_syntax", &RE2::Options::posix_syntax,
                       &RE2::Options::set_posix_syntax);
  options.def_property("longest_match", &RE2::Options::longest_match,
                       &RE2::Options::set_longest_match);
  options.def_property("log_errors", &RE2::Options::log_errors,
                       &RE2::Options::set_log_errors);
  options.def_property("literal", &RE2::Options::literal,
                       &RE2::Options::set_literal);
  options.def_property("never_nl", &RE2::Options::never_nl,
                       &RE2::Options::set_never_nl);
  options.def_property("dot_nl", &RE2::Options::dot_nl,
                       &RE2::Options::set_dot_nl);
  options.def_property("never_capture", &RE2::Options::never_capture,
                       &RE2::Options::set_never_capture);
  options.def_property("case_sensitive", &RE2::Options::case_sensitive,
                       &RE2::Options::set_case_sensitive);
  options.def_property("perl_classes", &RE2::Options::perl_classes,
                       &RE2::Options::set_perl_classes);
  options.def_property("word_boundary", &RE2::Options::word_boundary,
                       &RE2::Options::set_word_boundary);
  options.def_property("one_line", &RE2::Options::one_line,
                       &RE2::Options::set_one_line);

  // RE2: the *Shim functions adapt buffers and bytes for Python.
  re2.def(py::init(&RE2InitShim));
  re2.def("ok", &RE2::ok);
  re2.def("error", &RE2ErrorShim);
  re2.def("options", &RE2::options);
  re2.def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups);
  re2.def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim);
  re2.def("ProgramSize", &RE2::ProgramSize);
  re2.def("ReverseProgramSize", &RE2::ReverseProgramSize);
  re2.def("ProgramFanout", &RE2ProgramFanoutShim);
  re2.def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim);
  re2.def("PossibleMatchRange", &RE2PossibleMatchRangeShim);
  re2.def("Match", &RE2MatchShim);
  re2.def_static("QuoteMeta", &RE2QuoteMetaShim);

  // Set
  set.def(py::init<RE2::Anchor, const RE2::Options&>());
  set.def("Add", &Set::Add);
  set.def("Compile", &Set::Compile);
  set.def("Match", &Set::Match);

  // Filter: GetRE2 returns a reference tied to the Filter it came from.
  filter.def(py::init<>());
  filter.def("Add", &Filter::Add);
  filter.def("Compile", &Filter::Compile);
  filter.def("Match", &Filter::Match);
  filter.def("GetRE2", &Filter::GetRE2,
             py::return_value_policy::reference_internal);
}
351
352 } // namespace re2_python
353