// Copyright 2019 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include #include #include #include #include #include #include #include #include "absl/strings/string_view.h" #include "pybind11/buffer_info.h" #include "pybind11/gil.h" #include "pybind11/pybind11.h" #include "pybind11/pytypes.h" #include "pybind11/stl.h" // IWYU pragma: keep #include "re2/filtered_re2.h" #include "re2/re2.h" #include "re2/set.h" #ifdef _WIN32 #include #define ssize_t SSIZE_T #endif namespace re2_python { // This is conventional. namespace py = pybind11; // In terms of the pybind11 API, a py::buffer is merely a py::object that // supports the buffer interface/protocol and you must explicitly request // a py::buffer_info in order to access the actual bytes. Under the hood, // the py::buffer_info manages a reference count to the py::buffer, so it // must be constructed and subsequently destructed while holding the GIL. static inline absl::string_view FromBytes(const py::buffer_info& bytes) { char* data = reinterpret_cast(bytes.ptr); ssize_t size = bytes.size; return absl::string_view(data, size); } static inline int OneCharLen(const char* ptr) { return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*ptr & 0xFF) >> 4]; } // Helper function for when Python encodes str to bytes and then needs to // convert str offsets to bytes offsets. Assumes that text is valid UTF-8. ssize_t CharLenToBytes(py::buffer buffer, ssize_t pos, ssize_t len) { auto bytes = buffer.request(); auto text = FromBytes(bytes); auto ptr = text.data() + pos; auto end = text.data() + text.size(); while (ptr < end && len > 0) { ptr += OneCharLen(ptr); --len; } return ptr - (text.data() + pos); } // Helper function for when Python decodes bytes to str and then needs to // convert bytes offsets to str offsets. Assumes that text is valid UTF-8. ssize_t BytesToCharLen(py::buffer buffer, ssize_t pos, ssize_t endpos) { auto bytes = buffer.request(); auto text = FromBytes(bytes); auto ptr = text.data() + pos; auto end = text.data() + endpos; ssize_t len = 0; while (ptr < end) { ptr += OneCharLen(ptr); ++len; } return len; } std::unique_ptr RE2InitShim(py::buffer buffer, const RE2::Options& options) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); return std::make_unique(pattern, options); } py::bytes RE2ErrorShim(const RE2& self) { // Return std::string as bytes. That is, without decoding to str. return self.error(); } std::vector> RE2NamedCapturingGroupsShim( const RE2& self) { const int num_groups = self.NumberOfCapturingGroups(); std::vector> groups; groups.reserve(num_groups); for (const auto& it : self.NamedCapturingGroups()) { groups.emplace_back(it.first, it.second); } return groups; } std::vector RE2ProgramFanoutShim(const RE2& self) { std::vector histogram; self.ProgramFanout(&histogram); return histogram; } std::vector RE2ReverseProgramFanoutShim(const RE2& self) { std::vector histogram; self.ReverseProgramFanout(&histogram); return histogram; } std::tuple RE2PossibleMatchRangeShim( const RE2& self, int maxlen) { std::string min, max; // Return std::string as bytes. That is, without decoding to str. return {self.PossibleMatchRange(&min, &max, maxlen), min, max}; } std::vector> RE2MatchShim(const RE2& self, RE2::Anchor anchor, py::buffer buffer, ssize_t pos, ssize_t endpos) { auto bytes = buffer.request(); auto text = FromBytes(bytes); const int num_groups = self.NumberOfCapturingGroups() + 1; // need $0 std::vector groups; groups.resize(num_groups); py::gil_scoped_release release_gil; if (!self.Match(text, pos, endpos, anchor, groups.data(), groups.size())) { // Ensure that groups are null before converting to spans! for (auto& it : groups) { it = absl::string_view(); } } std::vector> spans; spans.reserve(num_groups); for (const auto& it : groups) { if (it.data() == NULL) { spans.emplace_back(-1, -1); } else { spans.emplace_back(it.data() - text.data(), it.data() - text.data() + it.size()); } } return spans; } py::bytes RE2QuoteMetaShim(py::buffer buffer) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); // Return std::string as bytes. That is, without decoding to str. return RE2::QuoteMeta(pattern); } class Set { public: Set(RE2::Anchor anchor, const RE2::Options& options) : set_(options, anchor) {} ~Set() = default; // Not copyable or movable. Set(const Set&) = delete; Set& operator=(const Set&) = delete; int Add(py::buffer buffer) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); int index = set_.Add(pattern, /*error=*/NULL); // -1 on error return index; } bool Compile() { // Compiling can fail. return set_.Compile(); } std::vector Match(py::buffer buffer) const { auto bytes = buffer.request(); auto text = FromBytes(bytes); std::vector matches; py::gil_scoped_release release_gil; set_.Match(text, &matches); return matches; } private: RE2::Set set_; }; class Filter { public: Filter() = default; ~Filter() = default; // Not copyable or movable. Filter(const Filter&) = delete; Filter& operator=(const Filter&) = delete; int Add(py::buffer buffer, const RE2::Options& options) { auto bytes = buffer.request(); auto pattern = FromBytes(bytes); int index = -1; // not clobbered on error filter_.Add(pattern, options, &index); return index; } bool Compile() { std::vector atoms; filter_.Compile(&atoms); RE2::Options options; options.set_literal(true); options.set_case_sensitive(false); set_ = std::make_unique(options, RE2::UNANCHORED); for (int i = 0; i < static_cast(atoms.size()); ++i) { if (set_->Add(atoms[i], /*error=*/NULL) != i) { // Should never happen: the atom is a literal! py::pybind11_fail("set_->Add() failed"); } } // Compiling can fail. return set_->Compile(); } std::vector Match(py::buffer buffer, bool potential) const { if (set_ == nullptr) { py::pybind11_fail("Match() called before compiling"); } auto bytes = buffer.request(); auto text = FromBytes(bytes); std::vector atoms; py::gil_scoped_release release_gil; set_->Match(text, &atoms); std::vector matches; if (potential) { filter_.AllPotentials(atoms, &matches); } else { filter_.AllMatches(text, atoms, &matches); } return matches; } const RE2& GetRE2(int index) const { return filter_.GetRE2(index); } private: re2::FilteredRE2 filter_; std::unique_ptr set_; }; PYBIND11_MODULE(_re2, module) { // Translate exceptions thrown by py::pybind11_fail() into Python. py::register_local_exception(module, "Error"); module.def("CharLenToBytes", &CharLenToBytes); module.def("BytesToCharLen", &BytesToCharLen); // CLASSES // class RE2 // enum Anchor // class Options // enum Encoding // class Set // class Filter py::class_ re2(module, "RE2"); py::enum_ anchor(re2, "Anchor"); py::class_ options(re2, "Options"); py::enum_ encoding(options, "Encoding"); py::class_ set(module, "Set"); py::class_ filter(module, "Filter"); anchor.value("UNANCHORED", RE2::Anchor::UNANCHORED); anchor.value("ANCHOR_START", RE2::Anchor::ANCHOR_START); anchor.value("ANCHOR_BOTH", RE2::Anchor::ANCHOR_BOTH); encoding.value("UTF8", RE2::Options::Encoding::EncodingUTF8); encoding.value("LATIN1", RE2::Options::Encoding::EncodingLatin1); options.def(py::init<>()) .def_property("max_mem", // &RE2::Options::max_mem, // &RE2::Options::set_max_mem) // .def_property("encoding", // &RE2::Options::encoding, // &RE2::Options::set_encoding) // .def_property("posix_syntax", // &RE2::Options::posix_syntax, // &RE2::Options::set_posix_syntax) // .def_property("longest_match", // &RE2::Options::longest_match, // &RE2::Options::set_longest_match) // .def_property("log_errors", // &RE2::Options::log_errors, // &RE2::Options::set_log_errors) // .def_property("literal", // &RE2::Options::literal, // &RE2::Options::set_literal) // .def_property("never_nl", // &RE2::Options::never_nl, // &RE2::Options::set_never_nl) // .def_property("dot_nl", // &RE2::Options::dot_nl, // &RE2::Options::set_dot_nl) // .def_property("never_capture", // &RE2::Options::never_capture, // &RE2::Options::set_never_capture) // .def_property("case_sensitive", // &RE2::Options::case_sensitive, // &RE2::Options::set_case_sensitive) // .def_property("perl_classes", // &RE2::Options::perl_classes, // &RE2::Options::set_perl_classes) // .def_property("word_boundary", // &RE2::Options::word_boundary, // &RE2::Options::set_word_boundary) // .def_property("one_line", // &RE2::Options::one_line, // &RE2::Options::set_one_line); // re2.def(py::init(&RE2InitShim)) .def("ok", &RE2::ok) .def("error", &RE2ErrorShim) .def("options", &RE2::options) .def("NumberOfCapturingGroups", &RE2::NumberOfCapturingGroups) .def("NamedCapturingGroups", &RE2NamedCapturingGroupsShim) .def("ProgramSize", &RE2::ProgramSize) .def("ReverseProgramSize", &RE2::ReverseProgramSize) .def("ProgramFanout", &RE2ProgramFanoutShim) .def("ReverseProgramFanout", &RE2ReverseProgramFanoutShim) .def("PossibleMatchRange", &RE2PossibleMatchRangeShim) .def("Match", &RE2MatchShim) .def_static("QuoteMeta", &RE2QuoteMetaShim); set.def(py::init()) .def("Add", &Set::Add) .def("Compile", &Set::Compile) .def("Match", &Set::Match); filter.def(py::init<>()) .def("Add", &Filter::Add) .def("Compile", &Filter::Compile) .def("Match", &Filter::Match) .def("GetRE2", &Filter::GetRE2, py::return_value_policy::reference_internal); } } // namespace re2_python