• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #![allow(non_camel_case_types)]
2 
3 use std::fmt;
4 use std::ptr;
5 use std::str;
6 
7 use libc::{c_int, c_void, size_t};
8 
9 pub struct Regex {
10     code: *mut code,
11     match_data: *mut match_data,
12     ovector: *mut size_t,
13 }
14 
15 unsafe impl Send for Regex {}
16 
17 impl Drop for Regex {
drop(&mut self)18     fn drop(&mut self) {
19         unsafe {
20             pcre2_match_data_free_8(self.match_data);
21             pcre2_code_free_8(self.code);
22         }
23     }
24 }
25 
26 pub struct Error {
27     code: c_int,
28     offset: size_t,
29 }
30 
31 impl Regex {
new(pattern: &str) -> Result<Regex, Error>32     pub fn new(pattern: &str) -> Result<Regex, Error> {
33         let mut error_code: c_int = 0;
34         let mut error_offset: size_t = 0;
35         let code = unsafe {
36             pcre2_compile_8(
37                 pattern.as_ptr(),
38                 pattern.len(),
39                 // PCRE2 can get significantly faster in some cases depending
40                 // on the permutation of these options (in particular, dropping
41                 // UCP). We should endeavor to have a separate "ASCII compatible"
42                 // benchmark.
43                 PCRE2_UCP | PCRE2_UTF,
44                 &mut error_code,
45                 &mut error_offset,
46                 ptr::null_mut(),
47             )
48         };
49         if code.is_null() {
50             return Err(Error { code: error_code, offset: error_offset });
51         }
52         let err = unsafe { pcre2_jit_compile_8(code, PCRE2_JIT_COMPLETE) };
53         if err < 0 {
54             panic!("pcre2_jit_compile_8 failed with error: {:?}", err);
55         }
56         let match_data = unsafe {
57             pcre2_match_data_create_from_pattern_8(code, ptr::null_mut())
58         };
59         if match_data.is_null() {
60             panic!("could not allocate match_data");
61         }
62         let ovector = unsafe { pcre2_get_ovector_pointer_8(match_data) };
63         if ovector.is_null() {
64             panic!("could not get ovector");
65         }
66         Ok(Regex { code: code, match_data: match_data, ovector: ovector })
67     }
68 
is_match(&self, text: &str) -> bool69     pub fn is_match(&self, text: &str) -> bool {
70         self.find_at(text, 0).is_some()
71     }
72 
find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't>73     pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
74         FindMatches { re: self, text: text, last_match_end: 0 }
75     }
76 
find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>77     fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
78         // The man pages for PCRE2 say that pcre2_jit_match is the fastest
79         // way to execute a JIT match because it skips sanity checks. We also
80         // explicitly disable the UTF-8 validity check, but it's probably not
81         // necessary.
82         let err = unsafe {
83             pcre2_jit_match_8(
84                 self.code,
85                 text.as_ptr(),
86                 text.len(),
87                 start,
88                 PCRE2_NO_UTF_CHECK,
89                 self.match_data,
90                 ptr::null_mut(),
91             )
92         };
93         if err == PCRE2_ERROR_NOMATCH {
94             None
95         } else if err < 0 {
96             panic!("unknown error code: {:?}", err)
97         } else {
98             Some(unsafe { (*self.ovector, *self.ovector.offset(1)) })
99         }
100     }
101 }
102 
103 pub struct FindMatches<'r, 't> {
104     re: &'r Regex,
105     text: &'t str,
106     last_match_end: usize,
107 }
108 
109 impl<'r, 't> Iterator for FindMatches<'r, 't> {
110     type Item = (usize, usize);
111 
next(&mut self) -> Option<(usize, usize)>112     fn next(&mut self) -> Option<(usize, usize)> {
113         match self.re.find_at(self.text, self.last_match_end) {
114             None => None,
115             Some((s, e)) => {
116                 self.last_match_end = e;
117                 Some((s, e))
118             }
119         }
120     }
121 }
122 
123 impl fmt::Debug for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result124     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
125         const BUF_LEN: size_t = 256;
126         let mut buf = [0; BUF_LEN];
127         let len = unsafe {
128             pcre2_get_error_message_8(self.code, buf.as_mut_ptr(), BUF_LEN)
129         };
130         if len < 0 {
131             write!(
132                 f,
133                 "Unknown PCRE error. (code: {:?}, offset: {:?})",
134                 self.code, self.offset
135             )
136         } else {
137             let msg = str::from_utf8(&buf[..len as usize]).unwrap();
138             write!(f, "error at {:?}: {}", self.offset, msg)
139         }
140     }
141 }
142 
143 // PCRE2 FFI. We only wrap the bits we need.
144 
145 const PCRE2_UCP: u32 = 0x00020000;
146 const PCRE2_UTF: u32 = 0x00080000;
147 const PCRE2_NO_UTF_CHECK: u32 = 0x40000000;
148 const PCRE2_JIT_COMPLETE: u32 = 0x00000001;
149 const PCRE2_ERROR_NOMATCH: c_int = -1;
150 
151 type code = c_void;
152 
153 type match_data = c_void;
154 
155 type compile_context = c_void; // unused
156 
157 type general_context = c_void; // unused
158 
159 type match_context = c_void; // unused
160 
161 extern "C" {
pcre2_compile_8( pattern: *const u8, len: size_t, options: u32, error_code: *mut c_int, error_offset: *mut size_t, context: *mut compile_context, ) -> *mut code162     fn pcre2_compile_8(
163         pattern: *const u8,
164         len: size_t,
165         options: u32,
166         error_code: *mut c_int,
167         error_offset: *mut size_t,
168         context: *mut compile_context,
169     ) -> *mut code;
170 
pcre2_code_free_8(code: *mut code)171     fn pcre2_code_free_8(code: *mut code);
172 
pcre2_match_data_create_from_pattern_8( code: *const code, context: *mut general_context, ) -> *mut match_data173     fn pcre2_match_data_create_from_pattern_8(
174         code: *const code,
175         context: *mut general_context,
176     ) -> *mut match_data;
177 
pcre2_match_data_free_8(match_data: *mut match_data)178     fn pcre2_match_data_free_8(match_data: *mut match_data);
179 
pcre2_get_ovector_pointer_8(match_data: *mut match_data) -> *mut size_t180     fn pcre2_get_ovector_pointer_8(match_data: *mut match_data)
181         -> *mut size_t;
182 
pcre2_jit_compile_8(code: *const code, options: u32) -> c_int183     fn pcre2_jit_compile_8(code: *const code, options: u32) -> c_int;
184 
pcre2_jit_match_8( code: *const code, subject: *const u8, length: size_t, startoffset: size_t, options: u32, match_data: *mut match_data, match_context: *mut match_context, ) -> c_int185     fn pcre2_jit_match_8(
186         code: *const code,
187         subject: *const u8,
188         length: size_t,
189         startoffset: size_t,
190         options: u32,
191         match_data: *mut match_data,
192         match_context: *mut match_context,
193     ) -> c_int;
194 
pcre2_get_error_message_8( error_code: c_int, buf: *mut u8, buflen: size_t, ) -> c_int195     fn pcre2_get_error_message_8(
196         error_code: c_int,
197         buf: *mut u8,
198         buflen: size_t,
199     ) -> c_int;
200 }
201