1 use std::collections::HashMap;
2 use std::ffi::{CStr, CString};
3 use std::ops::Deref;
4 use std::ptr;
5 use std::slice;
6 use std::str;
7
8 use libc::{c_char, size_t};
9 use regex::bytes;
10
11 use crate::error::{Error, ErrorKind};
12
// Bit flags accepted by the `flags` argument of `rure_compile` and
// `rure_compile_set`. Each bit is forwarded to the builder method named below.

/// Forwarded to the builder's `case_insensitive` setting.
const RURE_FLAG_CASEI: u32 = 1 << 0;
/// Forwarded to the builder's `multi_line` setting.
const RURE_FLAG_MULTI: u32 = 1 << 1;
/// Forwarded to the builder's `dot_matches_new_line` setting.
const RURE_FLAG_DOTNL: u32 = 1 << 2;
/// Forwarded to the builder's `swap_greed` setting.
const RURE_FLAG_SWAP_GREED: u32 = 1 << 3;
/// Forwarded to the builder's `ignore_whitespace` setting.
const RURE_FLAG_SPACE: u32 = 1 << 4;
/// Forwarded to the builder's `unicode` setting.
const RURE_FLAG_UNICODE: u32 = 1 << 5;
/// Flags used by `rure_compile_must`: only Unicode support is enabled.
const RURE_DEFAULT_FLAGS: u32 = RURE_FLAG_UNICODE;
20
21 pub struct Regex {
22 re: bytes::Regex,
23 capture_names: HashMap<String, i32>,
24 }
25
/// Compilation limits that may be passed to `rure_compile` and
/// `rure_compile_set`.
pub struct Options {
    /// Value forwarded to the builder's `size_limit`.
    size_limit: usize,
    /// Value forwarded to the builder's `dfa_size_limit`.
    dfa_size_limit: usize,
}
30
31 // The `RegexSet` is not exposed with option support or matching at an
32 // arbitrary position with a crate just yet. To circumvent this, we use
33 // the `Exec` structure directly.
34 pub struct RegexSet {
35 re: bytes::RegexSet,
36 }
37
38 #[repr(C)]
39 pub struct rure_match {
40 pub start: size_t,
41 pub end: size_t,
42 }
43
44 pub struct Captures(bytes::Locations);
45
46 pub struct Iter {
47 re: *const Regex,
48 last_end: usize,
49 last_match: Option<usize>,
50 }
51
52 pub struct IterCaptureNames {
53 capture_names: bytes::CaptureNames<'static>,
54 name_ptrs: Vec<*mut c_char>,
55 }
56
57 impl Deref for Regex {
58 type Target = bytes::Regex;
deref(&self) -> &bytes::Regex59 fn deref(&self) -> &bytes::Regex {
60 &self.re
61 }
62 }
63
64 impl Deref for RegexSet {
65 type Target = bytes::RegexSet;
deref(&self) -> &bytes::RegexSet66 fn deref(&self) -> &bytes::RegexSet {
67 &self.re
68 }
69 }
70
71 impl Default for Options {
default() -> Options72 fn default() -> Options {
73 Options { size_limit: 10 * (1 << 20), dfa_size_limit: 2 * (1 << 20) }
74 }
75 }
76
77 ffi_fn! {
78 fn rure_compile_must(pattern: *const c_char) -> *const Regex {
79 let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() };
80 let pat = pattern as *const u8;
81 let mut err = Error::new(ErrorKind::None);
82 let re = rure_compile(
83 pat, len, RURE_DEFAULT_FLAGS, ptr::null(), &mut err);
84 if err.is_err() {
85 let _ = writeln!(&mut io::stderr(), "{}", err);
86 let _ = writeln!(
87 &mut io::stderr(), "aborting from rure_compile_must");
88 unsafe { abort() }
89 }
90 re
91 }
92 }
93
94 ffi_fn! {
95 fn rure_compile(
96 pattern: *const u8,
97 length: size_t,
98 flags: u32,
99 options: *const Options,
100 error: *mut Error,
101 ) -> *const Regex {
102 let pat = unsafe { slice::from_raw_parts(pattern, length) };
103 let pat = match str::from_utf8(pat) {
104 Ok(pat) => pat,
105 Err(err) => {
106 unsafe {
107 if !error.is_null() {
108 *error = Error::new(ErrorKind::Str(err));
109 }
110 return ptr::null();
111 }
112 }
113 };
114 let mut builder = bytes::RegexBuilder::new(pat);
115 if !options.is_null() {
116 let options = unsafe { &*options };
117 builder.size_limit(options.size_limit);
118 builder.dfa_size_limit(options.dfa_size_limit);
119 }
120 builder.case_insensitive(flags & RURE_FLAG_CASEI > 0);
121 builder.multi_line(flags & RURE_FLAG_MULTI > 0);
122 builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0);
123 builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0);
124 builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0);
125 builder.unicode(flags & RURE_FLAG_UNICODE > 0);
126 match builder.build() {
127 Ok(re) => {
128 let mut capture_names = HashMap::new();
129 for (i, name) in re.capture_names().enumerate() {
130 if let Some(name) = name {
131 capture_names.insert(name.to_owned(), i as i32);
132 }
133 }
134 let re = Regex {
135 re: re,
136 capture_names: capture_names,
137 };
138 Box::into_raw(Box::new(re))
139 }
140 Err(err) => {
141 unsafe {
142 if !error.is_null() {
143 *error = Error::new(ErrorKind::Regex(err));
144 }
145 ptr::null()
146 }
147 }
148 }
149 }
150 }
151
152 ffi_fn! {
153 fn rure_free(re: *const Regex) {
154 unsafe { drop(Box::from_raw(re as *mut Regex)); }
155 }
156 }
157
158 ffi_fn! {
159 fn rure_is_match(
160 re: *const Regex,
161 haystack: *const u8,
162 len: size_t,
163 start: size_t,
164 ) -> bool {
165 let re = unsafe { &*re };
166 let haystack = unsafe { slice::from_raw_parts(haystack, len) };
167 re.is_match_at(haystack, start)
168 }
169 }
170
171 ffi_fn! {
172 fn rure_find(
173 re: *const Regex,
174 haystack: *const u8,
175 len: size_t,
176 start: size_t,
177 match_info: *mut rure_match,
178 ) -> bool {
179 let re = unsafe { &*re };
180 let haystack = unsafe { slice::from_raw_parts(haystack, len) };
181 re.find_at(haystack, start).map(|m| unsafe {
182 if !match_info.is_null() {
183 (*match_info).start = m.start();
184 (*match_info).end = m.end();
185 }
186 }).is_some()
187 }
188 }
189
190 ffi_fn! {
191 fn rure_find_captures(
192 re: *const Regex,
193 haystack: *const u8,
194 len: size_t,
195 start: size_t,
196 captures: *mut Captures,
197 ) -> bool {
198 let re = unsafe { &*re };
199 let haystack = unsafe { slice::from_raw_parts(haystack, len) };
200 let slots = unsafe { &mut (*captures).0 };
201 re.read_captures_at(slots, haystack, start).is_some()
202 }
203 }
204
205 ffi_fn! {
206 fn rure_shortest_match(
207 re: *const Regex,
208 haystack: *const u8,
209 len: size_t,
210 start: size_t,
211 end: *mut usize,
212 ) -> bool {
213 let re = unsafe { &*re };
214 let haystack = unsafe { slice::from_raw_parts(haystack, len) };
215 match re.shortest_match_at(haystack, start) {
216 None => false,
217 Some(i) => {
218 if !end.is_null() {
219 unsafe {
220 *end = i;
221 }
222 }
223 true
224 }
225 }
226 }
227 }
228
229 ffi_fn! {
230 fn rure_capture_name_index(
231 re: *const Regex,
232 name: *const c_char,
233 ) -> i32 {
234 let re = unsafe { &*re };
235 let name = unsafe { CStr::from_ptr(name) };
236 let name = match name.to_str() {
237 Err(_) => return -1,
238 Ok(name) => name,
239 };
240 re.capture_names.get(name).map(|&i|i).unwrap_or(-1)
241 }
242 }
243
244 ffi_fn! {
245 fn rure_iter_capture_names_new(
246 re: *const Regex,
247 ) -> *mut IterCaptureNames {
248 let re = unsafe { &*re };
249 Box::into_raw(Box::new(IterCaptureNames {
250 capture_names: re.re.capture_names(),
251 name_ptrs: Vec::new(),
252 }))
253 }
254 }
255
256 ffi_fn! {
257 fn rure_iter_capture_names_free(it: *mut IterCaptureNames) {
258 unsafe {
259 let it = &mut *it;
260 while let Some(ptr) = it.name_ptrs.pop() {
261 drop(CString::from_raw(ptr));
262 }
263 drop(Box::from_raw(it));
264 }
265 }
266 }
267
268 ffi_fn! {
269 fn rure_iter_capture_names_next(
270 it: *mut IterCaptureNames,
271 capture_name: *mut *mut c_char,
272 ) -> bool {
273 if capture_name.is_null() {
274 return false;
275 }
276
277 let it = unsafe { &mut *it };
278 let cn = match it.capture_names.next() {
279 // Top-level iterator ran out of capture groups
280 None => return false,
281 Some(val) => {
282 let name = match val {
283 // inner Option didn't have a name
284 None => "",
285 Some(name) => name
286 };
287 name
288 }
289 };
290
291 unsafe {
292 let cs = match CString::new(cn.as_bytes()) {
293 Result::Ok(val) => val,
294 Result::Err(_) => return false
295 };
296 let ptr = cs.into_raw();
297 it.name_ptrs.push(ptr);
298 *capture_name = ptr;
299 }
300 true
301
302 }
303 }
304
305 ffi_fn! {
306 fn rure_iter_new(
307 re: *const Regex,
308 ) -> *mut Iter {
309 Box::into_raw(Box::new(Iter {
310 re: re,
311 last_end: 0,
312 last_match: None,
313 }))
314 }
315 }
316
317 ffi_fn! {
318 fn rure_iter_free(it: *mut Iter) {
319 unsafe { drop(Box::from_raw(it)); }
320 }
321 }
322
323 ffi_fn! {
324 fn rure_iter_next(
325 it: *mut Iter,
326 haystack: *const u8,
327 len: size_t,
328 match_info: *mut rure_match,
329 ) -> bool {
330 let it = unsafe { &mut *it };
331 let re = unsafe { &*it.re };
332 let text = unsafe { slice::from_raw_parts(haystack, len) };
333 if it.last_end > text.len() {
334 return false;
335 }
336 let (s, e) = match re.find_at(text, it.last_end) {
337 None => return false,
338 Some(m) => (m.start(), m.end()),
339 };
340 if s == e {
341 // This is an empty match. To ensure we make progress, start
342 // the next search at the smallest possible starting position
343 // of the next match following this one.
344 it.last_end += 1;
345 // Don't accept empty matches immediately following a match.
346 // Just move on to the next match.
347 if Some(e) == it.last_match {
348 return rure_iter_next(it, haystack, len, match_info);
349 }
350 } else {
351 it.last_end = e;
352 }
353 it.last_match = Some(e);
354 if !match_info.is_null() {
355 unsafe {
356 (*match_info).start = s;
357 (*match_info).end = e;
358 }
359 }
360 true
361 }
362 }
363
364 ffi_fn! {
365 fn rure_iter_next_captures(
366 it: *mut Iter,
367 haystack: *const u8,
368 len: size_t,
369 captures: *mut Captures,
370 ) -> bool {
371 let it = unsafe { &mut *it };
372 let re = unsafe { &*it.re };
373 let slots = unsafe { &mut (*captures).0 };
374 let text = unsafe { slice::from_raw_parts(haystack, len) };
375 if it.last_end > text.len() {
376 return false;
377 }
378 let (s, e) = match re.read_captures_at(slots, text, it.last_end) {
379 None => return false,
380 Some(m) => (m.start(), m.end()),
381 };
382 if s == e {
383 // This is an empty match. To ensure we make progress, start
384 // the next search at the smallest possible starting position
385 // of the next match following this one.
386 it.last_end += 1;
387 // Don't accept empty matches immediately following a match.
388 // Just move on to the next match.
389 if Some(e) == it.last_match {
390 return rure_iter_next_captures(it, haystack, len, captures);
391 }
392 } else {
393 it.last_end = e;
394 }
395 it.last_match = Some(e);
396 true
397 }
398 }
399
400 ffi_fn! {
401 fn rure_captures_new(re: *const Regex) -> *mut Captures {
402 let re = unsafe { &*re };
403 let captures = Captures(re.locations());
404 Box::into_raw(Box::new(captures))
405 }
406 }
407
408 ffi_fn! {
409 fn rure_captures_free(captures: *const Captures) {
410 unsafe { drop(Box::from_raw(captures as *mut Captures)); }
411 }
412 }
413
414 ffi_fn! {
415 fn rure_captures_at(
416 captures: *const Captures,
417 i: size_t,
418 match_info: *mut rure_match,
419 ) -> bool {
420 let locs = unsafe { &(*captures).0 };
421 match locs.pos(i) {
422 Some((start, end)) => {
423 if !match_info.is_null() {
424 unsafe {
425 (*match_info).start = start;
426 (*match_info).end = end;
427 }
428 }
429 true
430 }
431 _ => false
432 }
433 }
434 }
435
436 ffi_fn! {
437 fn rure_captures_len(captures: *const Captures) -> size_t {
438 unsafe { (*captures).0.len() }
439 }
440 }
441
442 ffi_fn! {
443 fn rure_options_new() -> *mut Options {
444 Box::into_raw(Box::new(Options::default()))
445 }
446 }
447
448 ffi_fn! {
449 fn rure_options_free(options: *mut Options) {
450 unsafe { drop(Box::from_raw(options)); }
451 }
452 }
453
454 ffi_fn! {
455 fn rure_options_size_limit(options: *mut Options, limit: size_t) {
456 let options = unsafe { &mut *options };
457 options.size_limit = limit;
458 }
459 }
460
461 ffi_fn! {
462 fn rure_options_dfa_size_limit(options: *mut Options, limit: size_t) {
463 let options = unsafe { &mut *options };
464 options.dfa_size_limit = limit;
465 }
466 }
467
468 ffi_fn! {
469 fn rure_compile_set(
470 patterns: *const *const u8,
471 patterns_lengths: *const size_t,
472 patterns_count: size_t,
473 flags: u32,
474 options: *const Options,
475 error: *mut Error
476 ) -> *const RegexSet {
477 let (raw_pats, raw_patsl) = unsafe {
478 (
479 slice::from_raw_parts(patterns, patterns_count),
480 slice::from_raw_parts(patterns_lengths, patterns_count)
481 )
482 };
483
484 let mut pats = Vec::with_capacity(patterns_count);
485 for (&raw_pat, &raw_patl) in raw_pats.iter().zip(raw_patsl) {
486 let pat = unsafe { slice::from_raw_parts(raw_pat, raw_patl) };
487 pats.push(match str::from_utf8(pat) {
488 Ok(pat) => pat,
489 Err(err) => {
490 unsafe {
491 if !error.is_null() {
492 *error = Error::new(ErrorKind::Str(err));
493 }
494 return ptr::null();
495 }
496 }
497 });
498 }
499
500 let mut builder = bytes::RegexSetBuilder::new(pats);
501 if !options.is_null() {
502 let options = unsafe { &*options };
503 builder.size_limit(options.size_limit);
504 builder.dfa_size_limit(options.dfa_size_limit);
505 }
506 builder.case_insensitive(flags & RURE_FLAG_CASEI > 0);
507 builder.multi_line(flags & RURE_FLAG_MULTI > 0);
508 builder.dot_matches_new_line(flags & RURE_FLAG_DOTNL > 0);
509 builder.swap_greed(flags & RURE_FLAG_SWAP_GREED > 0);
510 builder.ignore_whitespace(flags & RURE_FLAG_SPACE > 0);
511 builder.unicode(flags & RURE_FLAG_UNICODE > 0);
512 match builder.build() {
513 Ok(re) => {
514 Box::into_raw(Box::new(RegexSet { re: re }))
515 }
516 Err(err) => {
517 unsafe {
518 if !error.is_null() {
519 *error = Error::new(ErrorKind::Regex(err))
520 }
521 ptr::null()
522 }
523 }
524 }
525 }
526 }
527
528 ffi_fn! {
529 fn rure_set_free(re: *const RegexSet) {
530 unsafe { drop(Box::from_raw(re as *mut RegexSet)); }
531 }
532 }
533
534 ffi_fn! {
535 fn rure_set_is_match(
536 re: *const RegexSet,
537 haystack: *const u8,
538 len: size_t,
539 start: size_t
540 ) -> bool {
541 let re = unsafe { &*re };
542 let haystack = unsafe { slice::from_raw_parts(haystack, len) };
543 re.is_match_at(haystack, start)
544 }
545 }
546
547 ffi_fn! {
548 fn rure_set_matches(
549 re: *const RegexSet,
550 haystack: *const u8,
551 len: size_t,
552 start: size_t,
553 matches: *mut bool
554 ) -> bool {
555 let re = unsafe { &*re };
556 let mut matches = unsafe {
557 slice::from_raw_parts_mut(matches, re.len())
558 };
559 let haystack = unsafe { slice::from_raw_parts(haystack, len) };
560
561 // read_matches_at isn't guaranteed to set non-matches to false
562 for item in matches.iter_mut() {
563 *item = false;
564 }
565 re.read_matches_at(&mut matches, haystack, start)
566 }
567 }
568
569 ffi_fn! {
570 fn rure_set_len(re: *const RegexSet) -> size_t {
571 unsafe { (*re).len() }
572 }
573 }
574
575 ffi_fn! {
576 fn rure_escape_must(pattern: *const c_char) -> *const c_char {
577 let len = unsafe { CStr::from_ptr(pattern).to_bytes().len() };
578 let pat = pattern as *const u8;
579 let mut err = Error::new(ErrorKind::None);
580 let esc = rure_escape(pat, len, &mut err);
581 if err.is_err() {
582 let _ = writeln!(&mut io::stderr(), "{}", err);
583 let _ = writeln!(
584 &mut io::stderr(), "aborting from rure_escape_must");
585 unsafe { abort() }
586 }
587 esc
588 }
589 }
590
591 /// A helper function that implements fallible escaping in a way that returns
592 /// an error if escaping failed.
593 ///
594 /// This should ideally be exposed, but it needs API design work. In
595 /// particular, this should not return a C string, but a `const uint8_t *`
596 /// instead, since it may contain a NUL byte.
rure_escape( pattern: *const u8, length: size_t, error: *mut Error, ) -> *const c_char597 fn rure_escape(
598 pattern: *const u8,
599 length: size_t,
600 error: *mut Error,
601 ) -> *const c_char {
602 let pat: &[u8] = unsafe { slice::from_raw_parts(pattern, length) };
603 let str_pat = match str::from_utf8(pat) {
604 Ok(val) => val,
605 Err(err) => unsafe {
606 if !error.is_null() {
607 *error = Error::new(ErrorKind::Str(err));
608 }
609 return ptr::null();
610 },
611 };
612 let esc_pat = regex::escape(str_pat);
613 let c_esc_pat = match CString::new(esc_pat) {
614 Ok(val) => val,
615 Err(err) => unsafe {
616 if !error.is_null() {
617 *error = Error::new(ErrorKind::Nul(err));
618 }
619 return ptr::null();
620 },
621 };
622 c_esc_pat.into_raw() as *const c_char
623 }
624
625 ffi_fn! {
626 fn rure_cstring_free(s: *mut c_char) {
627 unsafe { drop(CString::from_raw(s)); }
628 }
629 }
630