1 #[cfg(feature = "std")]
2 use std::borrow::Cow;
3 #[cfg(feature = "std")]
4 use std::ffi::OsStr;
5 #[cfg(feature = "std")]
6 use std::path::Path;
7
8 use core::{cmp, iter, ops, ptr, slice, str};
9 use memchr::{memchr, memrchr};
10
11 use ascii;
12 use bstr::BStr;
13 use byteset;
14 #[cfg(feature = "std")]
15 use ext_vec::ByteVec;
16 use search::{PrefilterState, TwoWay};
17 #[cfg(feature = "unicode")]
18 use unicode::{
19 whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes,
20 SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
21 WordsWithBreaks,
22 };
23 use utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error};
24
25 /// A short-hand constructor for building a `&[u8]`.
26 ///
27 /// This idiosyncratic constructor is useful for concisely building byte string
28 /// slices. Its primary utility is in conveniently writing byte string literals
29 /// in a uniform way. For example, consider this code that does not compile:
30 ///
31 /// ```ignore
32 /// let strs = vec![b"a", b"xy"];
33 /// ```
34 ///
35 /// The above code doesn't compile because the type of the byte string literal
36 /// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is
37 /// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored
38 /// in the same `Vec`. (This is dissimilar from normal Unicode string slices,
39 /// where both `"a"` and `"xy"` have the same type of `&'static str`.)
40 ///
41 /// One way of getting the above code to compile is to convert byte strings to
42 /// slices. You might try this:
43 ///
44 /// ```ignore
45 /// let strs = vec![&b"a", &b"xy"];
46 /// ```
47 ///
48 /// But this just creates values with type `& &'static [u8; 1]` and
49 /// `& &'static [u8; 2]`. Instead, you need to force the issue like so:
50 ///
51 /// ```
52 /// let strs = vec![&b"a"[..], &b"xy"[..]];
53 /// // or
54 /// let strs = vec![b"a".as_ref(), b"xy".as_ref()];
55 /// ```
56 ///
57 /// But neither of these are particularly convenient to type, especially when
58 /// it's something as common as a string literal. Thus, this constructor
59 /// permits writing the following instead:
60 ///
61 /// ```
62 /// use bstr::B;
63 ///
64 /// let strs = vec![B("a"), B(b"xy")];
65 /// ```
66 ///
67 /// Notice that this also lets you mix and match both string literals and byte
68 /// string literals. This can be quite convenient!
#[allow(non_snake_case)]
#[inline]
pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] {
    // Borrow the argument as a plain byte slice; the result lives exactly as
    // long as the reference that was passed in.
    let slice: &'a [u8] = bytes.as_ref();
    slice
}
74
75 impl ByteSlice for [u8] {
76 #[inline]
as_bytes(&self) -> &[u8]77 fn as_bytes(&self) -> &[u8] {
78 self
79 }
80
81 #[inline]
as_bytes_mut(&mut self) -> &mut [u8]82 fn as_bytes_mut(&mut self) -> &mut [u8] {
83 self
84 }
85 }
86
/// Ensure that callers cannot implement `ByteSlice` by making an
/// unimplementable trait its super trait.
pub trait Sealed {}
// `[u8]` is the type that receives a `Sealed` implementation in this part of
// the file, which is what allows `ByteSlice` to be implemented for it.
impl Sealed for [u8] {}
91
92 /// A trait that extends `&[u8]` with string oriented methods.
93 pub trait ByteSlice: Sealed {
94 /// A method for accessing the raw bytes of this type. This is always a
95 /// no-op and callers shouldn't care about it. This only exists for making
96 /// the extension trait work.
97 #[doc(hidden)]
as_bytes(&self) -> &[u8]98 fn as_bytes(&self) -> &[u8];
99
100 /// A method for accessing the raw bytes of this type, mutably. This is
101 /// always a no-op and callers shouldn't care about it. This only exists
102 /// for making the extension trait work.
103 #[doc(hidden)]
as_bytes_mut(&mut self) -> &mut [u8]104 fn as_bytes_mut(&mut self) -> &mut [u8];
105
106 /// Return this byte slice as a `&BStr`.
107 ///
/// Using `&BStr` is useful because of its `fmt::Debug` representation
109 /// and various other trait implementations (such as `PartialEq` and
110 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
111 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
112 /// sequences are used.
113 ///
114 /// # Examples
115 ///
116 /// Basic usage:
117 ///
118 /// ```
119 /// use bstr::ByteSlice;
120 ///
121 /// println!("{:?}", b"foo\xFFbar".as_bstr());
122 /// ```
123 #[inline]
as_bstr(&self) -> &BStr124 fn as_bstr(&self) -> &BStr {
125 BStr::new(self.as_bytes())
126 }
127
128 /// Return this byte slice as a `&mut BStr`.
129 ///
/// Using `&mut BStr` is useful because of its `fmt::Debug` representation
131 /// and various other trait implementations (such as `PartialEq` and
132 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
133 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
134 /// sequences are used.
135 ///
136 /// # Examples
137 ///
138 /// Basic usage:
139 ///
140 /// ```
141 /// use bstr::ByteSlice;
142 ///
143 /// let mut bytes = *b"foo\xFFbar";
144 /// println!("{:?}", &mut bytes.as_bstr_mut());
145 /// ```
146 #[inline]
as_bstr_mut(&mut self) -> &mut BStr147 fn as_bstr_mut(&mut self) -> &mut BStr {
148 BStr::new_mut(self.as_bytes_mut())
149 }
150
151 /// Create an immutable byte string from an OS string slice.
152 ///
153 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
154 /// this returns `None` if the given OS string is not valid UTF-8. (For
155 /// example, on Windows, file paths are allowed to be a sequence of
156 /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
157 /// valid UTF-8.)
158 ///
159 /// # Examples
160 ///
161 /// Basic usage:
162 ///
163 /// ```
164 /// use std::ffi::OsStr;
165 ///
166 /// use bstr::{B, ByteSlice};
167 ///
168 /// let os_str = OsStr::new("foo");
169 /// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8");
170 /// assert_eq!(bs, B("foo"));
171 /// ```
172 #[cfg(feature = "std")]
173 #[inline]
fn from_os_str(os_str: &OsStr) -> Option<&[u8]> {
    // Unix: an OS string exposes its bytes directly, so this never fails.
    #[cfg(unix)]
    #[inline]
    fn convert(source: &OsStr) -> Option<&[u8]> {
        use std::os::unix::ffi::OsStrExt;

        let raw = source.as_bytes();
        Some(raw)
    }

    // Elsewhere: only OS strings that are valid UTF-8 can be viewed as bytes.
    #[cfg(not(unix))]
    #[inline]
    fn convert(source: &OsStr) -> Option<&[u8]> {
        match source.to_str() {
            Some(text) => Some(text.as_bytes()),
            None => None,
        }
    }

    convert(os_str)
}
191
192 /// Create an immutable byte string from a file path.
193 ///
194 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
195 /// this returns `None` if the given path is not valid UTF-8. (For example,
196 /// on Windows, file paths are allowed to be a sequence of arbitrary 16-bit
197 /// integers. Not all such sequences can be transcoded to valid UTF-8.)
198 ///
199 /// # Examples
200 ///
201 /// Basic usage:
202 ///
203 /// ```
204 /// use std::path::Path;
205 ///
206 /// use bstr::{B, ByteSlice};
207 ///
208 /// let path = Path::new("foo");
209 /// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8");
210 /// assert_eq!(bs, B("foo"));
211 /// ```
212 #[cfg(feature = "std")]
213 #[inline]
from_path(path: &Path) -> Option<&[u8]>214 fn from_path(path: &Path) -> Option<&[u8]> {
215 Self::from_os_str(path.as_os_str())
216 }
217
218 /// Safely convert this byte string into a `&str` if it's valid UTF-8.
219 ///
220 /// If this byte string is not valid UTF-8, then an error is returned. The
221 /// error returned indicates the first invalid byte found and the length
222 /// of the error.
223 ///
224 /// In cases where a lossy conversion to `&str` is acceptable, then use one
225 /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or
226 /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into)
227 /// methods.
228 ///
229 /// # Examples
230 ///
231 /// Basic usage:
232 ///
233 /// ```
234 /// use bstr::{B, ByteSlice, ByteVec};
235 ///
236 /// # fn example() -> Result<(), bstr::Utf8Error> {
237 /// let s = B("☃βツ").to_str()?;
238 /// assert_eq!("☃βツ", s);
239 ///
240 /// let mut bstring = <Vec<u8>>::from("☃βツ");
241 /// bstring.push(b'\xFF');
242 /// let err = bstring.to_str().unwrap_err();
243 /// assert_eq!(8, err.valid_up_to());
244 /// # Ok(()) }; example().unwrap()
245 /// ```
246 #[inline]
to_str(&self) -> Result<&str, Utf8Error>247 fn to_str(&self) -> Result<&str, Utf8Error> {
248 utf8::validate(self.as_bytes()).map(|_| {
249 // SAFETY: This is safe because of the guarantees provided by
250 // utf8::validate.
251 unsafe { str::from_utf8_unchecked(self.as_bytes()) }
252 })
253 }
254
255 /// Unsafely convert this byte string into a `&str`, without checking for
256 /// valid UTF-8.
257 ///
258 /// # Safety
259 ///
260 /// Callers *must* ensure that this byte string is valid UTF-8 before
261 /// calling this method. Converting a byte string into a `&str` that is
262 /// not valid UTF-8 is considered undefined behavior.
263 ///
264 /// This routine is useful in performance sensitive contexts where the
265 /// UTF-8 validity of the byte string is already known and it is
266 /// undesirable to pay the cost of an additional UTF-8 validation check
267 /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs.
268 ///
269 /// # Examples
270 ///
271 /// Basic usage:
272 ///
273 /// ```
274 /// use bstr::{B, ByteSlice};
275 ///
276 /// // SAFETY: This is safe because string literals are guaranteed to be
277 /// // valid UTF-8 by the Rust compiler.
278 /// let s = unsafe { B("☃βツ").to_str_unchecked() };
279 /// assert_eq!("☃βツ", s);
280 /// ```
281 #[inline]
to_str_unchecked(&self) -> &str282 unsafe fn to_str_unchecked(&self) -> &str {
283 str::from_utf8_unchecked(self.as_bytes())
284 }
285
286 /// Convert this byte string to a valid UTF-8 string by replacing invalid
287 /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`).
288 ///
289 /// If the byte string is already valid UTF-8, then no copying or
/// allocation is performed and a borrowed string slice is returned. If
291 /// the byte string is not valid UTF-8, then an owned string buffer is
292 /// returned with invalid bytes replaced by the replacement codepoint.
293 ///
294 /// This method uses the "substitution of maximal subparts" (Unicode
295 /// Standard, Chapter 3, Section 9) strategy for inserting the replacement
296 /// codepoint. Specifically, a replacement codepoint is inserted whenever a
297 /// byte is found that cannot possibly lead to a valid code unit sequence.
298 /// If there were previous bytes that represented a prefix of a well-formed
299 /// code unit sequence, then all of those bytes are substituted with a
300 /// single replacement codepoint. The "substitution of maximal subparts"
301 /// strategy is the same strategy used by
302 /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
303 /// For a more precise description of the maximal subpart strategy, see
304 /// the Unicode Standard, Chapter 3, Section 9. See also
305 /// [Public Review Issue #121](http://www.unicode.org/review/pr-121.html).
306 ///
307 /// N.B. Rust's standard library also appears to use the same strategy,
308 /// but it does not appear to be an API guarantee.
309 ///
310 /// # Examples
311 ///
312 /// Basic usage:
313 ///
314 /// ```
315 /// use std::borrow::Cow;
316 ///
317 /// use bstr::ByteSlice;
318 ///
319 /// let mut bstring = <Vec<u8>>::from("☃βツ");
320 /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy());
321 ///
322 /// // Add a byte that makes the sequence invalid.
323 /// bstring.push(b'\xFF');
324 /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy());
325 /// ```
326 ///
327 /// This demonstrates the "maximal subpart" substitution logic.
328 ///
329 /// ```
330 /// use bstr::{B, ByteSlice};
331 ///
332 /// // \x61 is the ASCII codepoint for 'a'.
333 /// // \xF1\x80\x80 is a valid 3-byte code unit prefix.
334 /// // \xE1\x80 is a valid 2-byte code unit prefix.
335 /// // \xC2 is a valid 1-byte code unit prefix.
336 /// // \x62 is the ASCII codepoint for 'b'.
337 /// //
338 /// // In sum, each of the prefixes is replaced by a single replacement
339 /// // codepoint since none of the prefixes are properly completed. This
340 /// // is in contrast to other strategies that might insert a replacement
341 /// // codepoint for every single byte.
342 /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
343 /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
344 /// ```
345 #[cfg(feature = "std")]
346 #[inline]
to_str_lossy(&self) -> Cow<str>347 fn to_str_lossy(&self) -> Cow<str> {
348 match utf8::validate(self.as_bytes()) {
349 Ok(()) => {
350 // SAFETY: This is safe because of the guarantees provided by
351 // utf8::validate.
352 unsafe {
353 Cow::Borrowed(str::from_utf8_unchecked(self.as_bytes()))
354 }
355 }
356 Err(err) => {
357 let mut lossy = String::with_capacity(self.as_bytes().len());
358 let (valid, after) =
359 self.as_bytes().split_at(err.valid_up_to());
360 // SAFETY: This is safe because utf8::validate guarantees
361 // that all of `valid` is valid UTF-8.
362 lossy.push_str(unsafe { str::from_utf8_unchecked(valid) });
363 lossy.push_str("\u{FFFD}");
364 if let Some(len) = err.error_len() {
365 after[len..].to_str_lossy_into(&mut lossy);
366 }
367 Cow::Owned(lossy)
368 }
369 }
370 }
371
372 /// Copy the contents of this byte string into the given owned string
373 /// buffer, while replacing invalid UTF-8 code unit sequences with the
374 /// Unicode replacement codepoint (`U+FFFD`).
375 ///
376 /// This method uses the same "substitution of maximal subparts" strategy
377 /// for inserting the replacement codepoint as the
378 /// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) method.
379 ///
380 /// This routine is useful for amortizing allocation. However, unlike
381 /// `to_str_lossy`, this routine will _always_ copy the contents of this
382 /// byte string into the destination buffer, even if this byte string is
383 /// valid UTF-8.
384 ///
385 /// # Examples
386 ///
387 /// Basic usage:
388 ///
389 /// ```
390 /// use std::borrow::Cow;
391 ///
392 /// use bstr::ByteSlice;
393 ///
394 /// let mut bstring = <Vec<u8>>::from("☃βツ");
395 /// // Add a byte that makes the sequence invalid.
396 /// bstring.push(b'\xFF');
397 ///
398 /// let mut dest = String::new();
399 /// bstring.to_str_lossy_into(&mut dest);
400 /// assert_eq!("☃βツ\u{FFFD}", dest);
401 /// ```
402 #[cfg(feature = "std")]
403 #[inline]
to_str_lossy_into(&self, dest: &mut String)404 fn to_str_lossy_into(&self, dest: &mut String) {
405 let mut bytes = self.as_bytes();
406 dest.reserve(bytes.len());
407 loop {
408 match utf8::validate(bytes) {
409 Ok(()) => {
410 // SAFETY: This is safe because utf8::validate guarantees
411 // that all of `bytes` is valid UTF-8.
412 dest.push_str(unsafe { str::from_utf8_unchecked(bytes) });
413 break;
414 }
415 Err(err) => {
416 let (valid, after) = bytes.split_at(err.valid_up_to());
417 // SAFETY: This is safe because utf8::validate guarantees
418 // that all of `valid` is valid UTF-8.
419 dest.push_str(unsafe { str::from_utf8_unchecked(valid) });
420 dest.push_str("\u{FFFD}");
421 match err.error_len() {
422 None => break,
423 Some(len) => bytes = &after[len..],
424 }
425 }
426 }
427 }
428 }
429
430 /// Create an OS string slice from this byte string.
431 ///
432 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
433 /// this returns a UTF-8 decoding error if this byte string is not valid
434 /// UTF-8. (For example, on Windows, file paths are allowed to be a
435 /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
436 /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
437 /// 16-bit integers.)
438 ///
439 /// # Examples
440 ///
441 /// Basic usage:
442 ///
443 /// ```
444 /// use bstr::{B, ByteSlice};
445 ///
446 /// let os_str = b"foo".to_os_str().expect("should be valid UTF-8");
447 /// assert_eq!(os_str, "foo");
448 /// ```
449 #[cfg(feature = "std")]
450 #[inline]
to_os_str(&self) -> Result<&OsStr, Utf8Error>451 fn to_os_str(&self) -> Result<&OsStr, Utf8Error> {
452 #[cfg(unix)]
453 #[inline]
454 fn imp(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
455 use std::os::unix::ffi::OsStrExt;
456
457 Ok(OsStr::from_bytes(bytes))
458 }
459
460 #[cfg(not(unix))]
461 #[inline]
462 fn imp(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
463 bytes.to_str().map(OsStr::new)
464 }
465
466 imp(self.as_bytes())
467 }
468
469 /// Lossily create an OS string slice from this byte string.
470 ///
471 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
472 /// this will perform a UTF-8 check and lossily convert this byte string
473 /// into valid UTF-8 using the Unicode replacement codepoint.
474 ///
475 /// Note that this can prevent the correct roundtripping of file paths on
476 /// non-Unix systems such as Windows, where file paths are an arbitrary
477 /// sequence of 16-bit integers.
478 ///
479 /// # Examples
480 ///
481 /// Basic usage:
482 ///
483 /// ```
484 /// use bstr::ByteSlice;
485 ///
486 /// let os_str = b"foo\xFFbar".to_os_str_lossy();
487 /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
488 /// ```
489 #[cfg(feature = "std")]
490 #[inline]
to_os_str_lossy(&self) -> Cow<OsStr>491 fn to_os_str_lossy(&self) -> Cow<OsStr> {
492 #[cfg(unix)]
493 #[inline]
494 fn imp(bytes: &[u8]) -> Cow<OsStr> {
495 use std::os::unix::ffi::OsStrExt;
496
497 Cow::Borrowed(OsStr::from_bytes(bytes))
498 }
499
500 #[cfg(not(unix))]
501 #[inline]
502 fn imp(bytes: &[u8]) -> Cow<OsStr> {
503 use std::ffi::OsString;
504
505 match bytes.to_str_lossy() {
506 Cow::Borrowed(x) => Cow::Borrowed(OsStr::new(x)),
507 Cow::Owned(x) => Cow::Owned(OsString::from(x)),
508 }
509 }
510
511 imp(self.as_bytes())
512 }
513
514 /// Create a path slice from this byte string.
515 ///
516 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
517 /// this returns a UTF-8 decoding error if this byte string is not valid
518 /// UTF-8. (For example, on Windows, file paths are allowed to be a
519 /// sequence of arbitrary 16-bit integers. There is no obvious mapping from
520 /// an arbitrary sequence of 8-bit integers to an arbitrary sequence of
521 /// 16-bit integers.)
522 ///
523 /// # Examples
524 ///
525 /// Basic usage:
526 ///
527 /// ```
528 /// use bstr::ByteSlice;
529 ///
530 /// let path = b"foo".to_path().expect("should be valid UTF-8");
531 /// assert_eq!(path.as_os_str(), "foo");
532 /// ```
533 #[cfg(feature = "std")]
534 #[inline]
to_path(&self) -> Result<&Path, Utf8Error>535 fn to_path(&self) -> Result<&Path, Utf8Error> {
536 self.to_os_str().map(Path::new)
537 }
538
539 /// Lossily create a path slice from this byte string.
540 ///
541 /// On Unix, this always succeeds and is zero cost. On non-Unix systems,
542 /// this will perform a UTF-8 check and lossily convert this byte string
543 /// into valid UTF-8 using the Unicode replacement codepoint.
544 ///
545 /// Note that this can prevent the correct roundtripping of file paths on
546 /// non-Unix systems such as Windows, where file paths are an arbitrary
547 /// sequence of 16-bit integers.
548 ///
549 /// # Examples
550 ///
551 /// Basic usage:
552 ///
553 /// ```
554 /// use bstr::ByteSlice;
555 ///
556 /// let bs = b"foo\xFFbar";
557 /// let path = bs.to_path_lossy();
558 /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
559 /// ```
560 #[cfg(feature = "std")]
561 #[inline]
to_path_lossy(&self) -> Cow<Path>562 fn to_path_lossy(&self) -> Cow<Path> {
563 use std::path::PathBuf;
564
565 match self.to_os_str_lossy() {
566 Cow::Borrowed(x) => Cow::Borrowed(Path::new(x)),
567 Cow::Owned(x) => Cow::Owned(PathBuf::from(x)),
568 }
569 }
570
571 /// Create a new byte string by repeating this byte string `n` times.
572 ///
573 /// # Panics
574 ///
575 /// This function panics if the capacity of the new byte string would
576 /// overflow.
577 ///
578 /// # Examples
579 ///
580 /// Basic usage:
581 ///
582 /// ```
583 /// use bstr::{B, ByteSlice};
584 ///
585 /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
586 /// assert_eq!(b"foo".repeatn(0), B(""));
587 /// ```
588 #[cfg(feature = "std")]
589 #[inline]
repeatn(&self, n: usize) -> Vec<u8>590 fn repeatn(&self, n: usize) -> Vec<u8> {
591 let bs = self.as_bytes();
592 let mut dst = vec![0; bs.len() * n];
593 for i in 0..n {
594 dst[i * bs.len()..(i + 1) * bs.len()].copy_from_slice(bs);
595 }
596 dst
597 }
598
599 /// Returns true if and only if this byte string contains the given needle.
600 ///
601 /// # Examples
602 ///
603 /// Basic usage:
604 ///
605 /// ```
606 /// use bstr::ByteSlice;
607 ///
608 /// assert!(b"foo bar".contains_str("foo"));
609 /// assert!(b"foo bar".contains_str("bar"));
610 /// assert!(!b"foo".contains_str("foobar"));
611 /// ```
612 #[inline]
contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool613 fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool {
614 self.find(needle).is_some()
615 }
616
617 /// Returns true if and only if this byte string has the given prefix.
618 ///
619 /// # Examples
620 ///
621 /// Basic usage:
622 ///
623 /// ```
624 /// use bstr::ByteSlice;
625 ///
626 /// assert!(b"foo bar".starts_with_str("foo"));
627 /// assert!(!b"foo bar".starts_with_str("bar"));
628 /// assert!(!b"foo".starts_with_str("foobar"));
629 /// ```
630 #[inline]
starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool631 fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool {
632 self.as_bytes().starts_with(prefix.as_ref())
633 }
634
635 /// Returns true if and only if this byte string has the given suffix.
636 ///
637 /// # Examples
638 ///
639 /// Basic usage:
640 ///
641 /// ```
642 /// use bstr::ByteSlice;
643 ///
644 /// assert!(b"foo bar".ends_with_str("bar"));
645 /// assert!(!b"foo bar".ends_with_str("foo"));
646 /// assert!(!b"bar".ends_with_str("foobar"));
647 /// ```
648 #[inline]
ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool649 fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool {
650 self.as_bytes().ends_with(suffix.as_ref())
651 }
652
653 /// Returns the index of the first occurrence of the given needle.
654 ///
655 /// The needle may be any type that can be cheaply converted into a
656 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
657 ///
/// Note that if you are searching for the same needle in many
659 /// different small haystacks, it may be faster to initialize a
660 /// [`Finder`](struct.Finder.html) once, and reuse it for each search.
661 ///
662 /// # Complexity
663 ///
664 /// This routine is guaranteed to have worst case linear time complexity
665 /// with respect to both the needle and the haystack. That is, this runs
666 /// in `O(needle.len() + haystack.len())` time.
667 ///
668 /// This routine is also guaranteed to have worst case constant space
669 /// complexity.
670 ///
671 /// # Examples
672 ///
673 /// Basic usage:
674 ///
675 /// ```
676 /// use bstr::ByteSlice;
677 ///
678 /// let s = b"foo bar baz";
679 /// assert_eq!(Some(0), s.find("foo"));
680 /// assert_eq!(Some(4), s.find("bar"));
681 /// assert_eq!(None, s.find("quux"));
682 /// ```
683 #[inline]
find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize>684 fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
685 Finder::new(needle.as_ref()).find(self.as_bytes())
686 }
687
688 /// Returns the index of the last occurrence of the given needle.
689 ///
690 /// The needle may be any type that can be cheaply converted into a
691 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
692 ///
/// Note that if you are searching for the same needle in many
694 /// different small haystacks, it may be faster to initialize a
695 /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for
696 /// each search.
697 ///
698 /// # Complexity
699 ///
700 /// This routine is guaranteed to have worst case linear time complexity
701 /// with respect to both the needle and the haystack. That is, this runs
702 /// in `O(needle.len() + haystack.len())` time.
703 ///
704 /// This routine is also guaranteed to have worst case constant space
705 /// complexity.
706 ///
707 /// # Examples
708 ///
709 /// Basic usage:
710 ///
711 /// ```
712 /// use bstr::ByteSlice;
713 ///
714 /// let s = b"foo bar baz";
715 /// assert_eq!(Some(0), s.rfind("foo"));
716 /// assert_eq!(Some(4), s.rfind("bar"));
717 /// assert_eq!(Some(8), s.rfind("ba"));
718 /// assert_eq!(None, s.rfind("quux"));
719 /// ```
720 #[inline]
rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize>721 fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
722 FinderReverse::new(needle.as_ref()).rfind(self.as_bytes())
723 }
724
725 /// Returns an iterator of the non-overlapping occurrences of the given
726 /// needle. The iterator yields byte offset positions indicating the start
727 /// of each match.
728 ///
729 /// # Complexity
730 ///
731 /// This routine is guaranteed to have worst case linear time complexity
732 /// with respect to both the needle and the haystack. That is, this runs
733 /// in `O(needle.len() + haystack.len())` time.
734 ///
735 /// This routine is also guaranteed to have worst case constant space
736 /// complexity.
737 ///
738 /// # Examples
739 ///
740 /// Basic usage:
741 ///
742 /// ```
743 /// use bstr::ByteSlice;
744 ///
745 /// let s = b"foo bar foo foo quux foo";
746 /// let matches: Vec<usize> = s.find_iter("foo").collect();
747 /// assert_eq!(matches, vec![0, 8, 12, 21]);
748 /// ```
749 ///
750 /// An empty string matches at every position, including the position
751 /// immediately following the last byte:
752 ///
753 /// ```
754 /// use bstr::ByteSlice;
755 ///
756 /// let matches: Vec<usize> = b"foo".find_iter("").collect();
757 /// assert_eq!(matches, vec![0, 1, 2, 3]);
758 ///
759 /// let matches: Vec<usize> = b"".find_iter("").collect();
760 /// assert_eq!(matches, vec![0]);
761 /// ```
762 #[inline]
find_iter<'a, B: ?Sized + AsRef<[u8]>>( &'a self, needle: &'a B, ) -> Find<'a>763 fn find_iter<'a, B: ?Sized + AsRef<[u8]>>(
764 &'a self,
765 needle: &'a B,
766 ) -> Find<'a> {
767 Find::new(self.as_bytes(), needle.as_ref())
768 }
769
770 /// Returns an iterator of the non-overlapping occurrences of the given
771 /// needle in reverse. The iterator yields byte offset positions indicating
772 /// the start of each match.
773 ///
774 /// # Complexity
775 ///
776 /// This routine is guaranteed to have worst case linear time complexity
777 /// with respect to both the needle and the haystack. That is, this runs
778 /// in `O(needle.len() + haystack.len())` time.
779 ///
780 /// This routine is also guaranteed to have worst case constant space
781 /// complexity.
782 ///
783 /// # Examples
784 ///
785 /// Basic usage:
786 ///
787 /// ```
788 /// use bstr::ByteSlice;
789 ///
790 /// let s = b"foo bar foo foo quux foo";
791 /// let matches: Vec<usize> = s.rfind_iter("foo").collect();
792 /// assert_eq!(matches, vec![21, 12, 8, 0]);
793 /// ```
794 ///
795 /// An empty string matches at every position, including the position
796 /// immediately following the last byte:
797 ///
798 /// ```
799 /// use bstr::ByteSlice;
800 ///
801 /// let matches: Vec<usize> = b"foo".rfind_iter("").collect();
802 /// assert_eq!(matches, vec![3, 2, 1, 0]);
803 ///
804 /// let matches: Vec<usize> = b"".rfind_iter("").collect();
805 /// assert_eq!(matches, vec![0]);
806 /// ```
807 #[inline]
rfind_iter<'a, B: ?Sized + AsRef<[u8]>>( &'a self, needle: &'a B, ) -> FindReverse<'a>808 fn rfind_iter<'a, B: ?Sized + AsRef<[u8]>>(
809 &'a self,
810 needle: &'a B,
811 ) -> FindReverse<'a> {
812 FindReverse::new(self.as_bytes(), needle.as_ref())
813 }
814
815 /// Returns the index of the first occurrence of the given byte. If the
816 /// byte does not occur in this byte string, then `None` is returned.
817 ///
818 /// # Examples
819 ///
820 /// Basic usage:
821 ///
822 /// ```
823 /// use bstr::ByteSlice;
824 ///
825 /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z'));
826 /// assert_eq!(None, b"foo bar baz".find_byte(b'y'));
827 /// ```
828 #[inline]
find_byte(&self, byte: u8) -> Option<usize>829 fn find_byte(&self, byte: u8) -> Option<usize> {
830 memchr(byte, self.as_bytes())
831 }
832
833 /// Returns the index of the last occurrence of the given byte. If the
834 /// byte does not occur in this byte string, then `None` is returned.
835 ///
836 /// # Examples
837 ///
838 /// Basic usage:
839 ///
840 /// ```
841 /// use bstr::ByteSlice;
842 ///
843 /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z'));
844 /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y'));
845 /// ```
846 #[inline]
rfind_byte(&self, byte: u8) -> Option<usize>847 fn rfind_byte(&self, byte: u8) -> Option<usize> {
848 memrchr(byte, self.as_bytes())
849 }
850
851 /// Returns the index of the first occurrence of the given codepoint.
852 /// If the codepoint does not occur in this byte string, then `None` is
853 /// returned.
854 ///
855 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
856 /// then only explicit occurrences of that encoding will be found. Invalid
857 /// UTF-8 sequences will not be matched.
858 ///
859 /// # Examples
860 ///
861 /// Basic usage:
862 ///
863 /// ```
864 /// use bstr::{B, ByteSlice};
865 ///
866 /// assert_eq!(Some(10), b"foo bar baz".find_char('z'));
867 /// assert_eq!(Some(4), B("αβγγδ").find_char('γ'));
868 /// assert_eq!(None, b"foo bar baz".find_char('y'));
869 /// ```
870 #[inline]
find_char(&self, ch: char) -> Option<usize>871 fn find_char(&self, ch: char) -> Option<usize> {
872 self.find(ch.encode_utf8(&mut [0; 4]))
873 }
874
875 /// Returns the index of the last occurrence of the given codepoint.
876 /// If the codepoint does not occur in this byte string, then `None` is
877 /// returned.
878 ///
879 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
880 /// then only explicit occurrences of that encoding will be found. Invalid
881 /// UTF-8 sequences will not be matched.
882 ///
883 /// # Examples
884 ///
885 /// Basic usage:
886 ///
887 /// ```
888 /// use bstr::{B, ByteSlice};
889 ///
890 /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z'));
891 /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ'));
892 /// assert_eq!(None, b"foo bar baz".rfind_char('y'));
893 /// ```
894 #[inline]
rfind_char(&self, ch: char) -> Option<usize>895 fn rfind_char(&self, ch: char) -> Option<usize> {
896 self.rfind(ch.encode_utf8(&mut [0; 4]))
897 }
898
899 /// Returns the index of the first occurrence of any of the bytes in the
900 /// provided set.
901 ///
902 /// The `byteset` may be any type that can be cheaply converted into a
903 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
904 /// note that passing a `&str` which contains multibyte characters may not
905 /// behave as you expect: each byte in the `&str` is treated as an
906 /// individual member of the byte set.
907 ///
908 /// Note that order is irrelevant for the `byteset` parameter, and
909 /// duplicate bytes present in its body are ignored.
910 ///
911 /// # Complexity
912 ///
913 /// This routine is guaranteed to have worst case linear time complexity
914 /// with respect to both the set of bytes and the haystack. That is, this
915 /// runs in `O(byteset.len() + haystack.len())` time.
916 ///
917 /// This routine is also guaranteed to have worst case constant space
918 /// complexity.
919 ///
920 /// # Examples
921 ///
922 /// Basic usage:
923 ///
924 /// ```
925 /// use bstr::ByteSlice;
926 ///
927 /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
928 /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
929 /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
930 /// ```
931 #[inline]
find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>932 fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
933 byteset::find(self.as_bytes(), byteset.as_ref())
934 }
935
936 /// Returns the index of the first occurrence of a byte that is not a member
937 /// of the provided set.
938 ///
939 /// The `byteset` may be any type that can be cheaply converted into a
940 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
941 /// note that passing a `&str` which contains multibyte characters may not
942 /// behave as you expect: each byte in the `&str` is treated as an
943 /// individual member of the byte set.
944 ///
945 /// Note that order is irrelevant for the `byteset` parameter, and
946 /// duplicate bytes present in its body are ignored.
947 ///
948 /// # Complexity
949 ///
950 /// This routine is guaranteed to have worst case linear time complexity
951 /// with respect to both the set of bytes and the haystack. That is, this
952 /// runs in `O(byteset.len() + haystack.len())` time.
953 ///
954 /// This routine is also guaranteed to have worst case constant space
955 /// complexity.
956 ///
957 /// # Examples
958 ///
959 /// Basic usage:
960 ///
961 /// ```
962 /// use bstr::ByteSlice;
963 ///
964 /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
965 /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
966 /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
967 /// ```
968 #[inline]
find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>969 fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
970 byteset::find_not(self.as_bytes(), byteset.as_ref())
971 }
972
973 /// Returns the index of the last occurrence of any of the bytes in the
974 /// provided set.
975 ///
976 /// The `byteset` may be any type that can be cheaply converted into a
977 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
978 /// note that passing a `&str` which contains multibyte characters may not
979 /// behave as you expect: each byte in the `&str` is treated as an
980 /// individual member of the byte set.
981 ///
982 /// Note that order is irrelevant for the `byteset` parameter, and duplicate
983 /// bytes present in its body are ignored.
984 ///
985 /// # Complexity
986 ///
987 /// This routine is guaranteed to have worst case linear time complexity
988 /// with respect to both the set of bytes and the haystack. That is, this
989 /// runs in `O(byteset.len() + haystack.len())` time.
990 ///
991 /// This routine is also guaranteed to have worst case constant space
992 /// complexity.
993 ///
994 /// # Examples
995 ///
996 /// Basic usage:
997 ///
998 /// ```
999 /// use bstr::ByteSlice;
1000 ///
1001 /// assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9));
1002 /// assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10));
1003 /// assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None);
1004 /// ```
1005 #[inline]
rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>1006 fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1007 byteset::rfind(self.as_bytes(), byteset.as_ref())
1008 }
1009
1010 /// Returns the index of the last occurrence of a byte that is not a member
1011 /// of the provided set.
1012 ///
1013 /// The `byteset` may be any type that can be cheaply converted into a
1014 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
1015 /// note that passing a `&str` which contains multibyte characters may not
1016 /// behave as you expect: each byte in the `&str` is treated as an
1017 /// individual member of the byte set.
1018 ///
1019 /// Note that order is irrelevant for the `byteset` parameter, and
1020 /// duplicate bytes present in its body are ignored.
1021 ///
1022 /// # Complexity
1023 ///
1024 /// This routine is guaranteed to have worst case linear time complexity
1025 /// with respect to both the set of bytes and the haystack. That is, this
1026 /// runs in `O(byteset.len() + haystack.len())` time.
1027 ///
1028 /// This routine is also guaranteed to have worst case constant space
1029 /// complexity.
1030 ///
1031 /// # Examples
1032 ///
1033 /// Basic usage:
1034 ///
1035 /// ```
1036 /// use bstr::ByteSlice;
1037 ///
1038 /// assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10));
1039 /// assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2));
1040 /// assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz "));
1041 /// ```
1042 #[inline]
rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>1043 fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1044 byteset::rfind_not(self.as_bytes(), byteset.as_ref())
1045 }
1046
1047 /// Returns an iterator over the fields in a byte string, separated by
1048 /// contiguous whitespace.
1049 ///
1050 /// # Example
1051 ///
1052 /// Basic usage:
1053 ///
1054 /// ```
1055 /// use bstr::{B, ByteSlice};
1056 ///
1057 /// let s = B(" foo\tbar\t\u{2003}\nquux \n");
1058 /// let fields: Vec<&[u8]> = s.fields().collect();
1059 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1060 /// ```
1061 ///
1062 /// A byte string consisting of just whitespace yields no elements:
1063 ///
1064 /// ```
1065 /// use bstr::{B, ByteSlice};
1066 ///
1067 /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
1068 /// ```
1069 #[inline]
fields(&self) -> Fields1070 fn fields(&self) -> Fields {
1071 Fields::new(self.as_bytes())
1072 }
1073
1074 /// Returns an iterator over the fields in a byte string, separated by
1075 /// contiguous codepoints satisfying the given predicate.
1076 ///
1077 /// If this byte string is not valid UTF-8, then the given closure will
1078 /// be called with a Unicode replacement codepoint when invalid UTF-8
1079 /// bytes are seen.
1080 ///
1081 /// # Example
1082 ///
1083 /// Basic usage:
1084 ///
1085 /// ```
1086 /// use bstr::{B, ByteSlice};
1087 ///
1088 /// let s = b"123foo999999bar1quux123456";
1089 /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect();
1090 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1091 /// ```
1092 ///
1093 /// A byte string consisting of all codepoints satisfying the predicate
1094 /// yields no elements:
1095 ///
1096 /// ```
1097 /// use bstr::ByteSlice;
1098 ///
1099 /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
1100 /// ```
1101 #[inline]
fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<F>1102 fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<F> {
1103 FieldsWith::new(self.as_bytes(), f)
1104 }
1105
1106 /// Returns an iterator over substrings of this byte string, separated
1107 /// by the given byte string. Each element yielded is guaranteed not to
1108 /// include the splitter substring.
1109 ///
1110 /// The splitter may be any type that can be cheaply converted into a
1111 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1112 ///
1113 /// # Examples
1114 ///
1115 /// Basic usage:
1116 ///
1117 /// ```
1118 /// use bstr::{B, ByteSlice};
1119 ///
1120 /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect();
1121 /// assert_eq!(x, vec![
1122 /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"),
1123 /// ]);
1124 ///
1125 /// let x: Vec<&[u8]> = b"".split_str("X").collect();
1126 /// assert_eq!(x, vec![b""]);
1127 ///
1128 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect();
1129 /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]);
1130 ///
1131 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect();
1132 /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]);
1133 /// ```
1134 ///
1135 /// If a string contains multiple contiguous separators, you will end up
1136 /// with empty strings yielded by the iterator:
1137 ///
1138 /// ```
1139 /// use bstr::{B, ByteSlice};
1140 ///
1141 /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect();
1142 /// assert_eq!(x, vec![
1143 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1144 /// ]);
1145 ///
1146 /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect();
1147 /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]);
1148 /// ```
1149 ///
1150 /// Separators at the start or end of a string are neighbored by empty
1151 /// strings.
1152 ///
1153 /// ```
1154 /// use bstr::{B, ByteSlice};
1155 ///
1156 /// let x: Vec<&[u8]> = b"010".split_str("0").collect();
1157 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1158 /// ```
1159 ///
1160 /// When the empty string is used as a separator, it splits every **byte**
1161 /// in the byte string, along with the beginning and end of the byte
1162 /// string.
1163 ///
1164 /// ```
1165 /// use bstr::{B, ByteSlice};
1166 ///
1167 /// let x: Vec<&[u8]> = b"rust".split_str("").collect();
1168 /// assert_eq!(x, vec![
1169 /// B(""), B("r"), B("u"), B("s"), B("t"), B(""),
1170 /// ]);
1171 ///
1172 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1173 /// // may not be valid UTF-8!
1174 /// let x: Vec<&[u8]> = B("☃").split_str("").collect();
1175 /// assert_eq!(x, vec![
1176 /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""),
1177 /// ]);
1178 /// ```
1179 ///
1180 /// Contiguous separators, especially whitespace, can lead to possibly
1181 /// surprising behavior. For example, this code is correct:
1182 ///
1183 /// ```
1184 /// use bstr::{B, ByteSlice};
1185 ///
/// let x: Vec<&[u8]> = b"    a  b c".split_str(" ").collect();
1187 /// assert_eq!(x, vec![
1188 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1189 /// ]);
1190 /// ```
1191 ///
1192 /// It does *not* give you `["a", "b", "c"]`. For that behavior, use
1193 /// [`fields`](#method.fields) instead.
1194 #[inline]
split_str<'a, B: ?Sized + AsRef<[u8]>>( &'a self, splitter: &'a B, ) -> Split<'a>1195 fn split_str<'a, B: ?Sized + AsRef<[u8]>>(
1196 &'a self,
1197 splitter: &'a B,
1198 ) -> Split<'a> {
1199 Split::new(self.as_bytes(), splitter.as_ref())
1200 }
1201
1202 /// Returns an iterator over substrings of this byte string, separated by
1203 /// the given byte string, in reverse. Each element yielded is guaranteed
1204 /// not to include the splitter substring.
1205 ///
1206 /// The splitter may be any type that can be cheaply converted into a
1207 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1208 ///
1209 /// # Examples
1210 ///
1211 /// Basic usage:
1212 ///
1213 /// ```
1214 /// use bstr::{B, ByteSlice};
1215 ///
1216 /// let x: Vec<&[u8]> =
1217 /// b"Mary had a little lamb".rsplit_str(" ").collect();
1218 /// assert_eq!(x, vec![
1219 /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"),
1220 /// ]);
1221 ///
1222 /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect();
1223 /// assert_eq!(x, vec![b""]);
1224 ///
1225 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect();
1226 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]);
1227 ///
1228 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect();
1229 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]);
1230 /// ```
1231 ///
1232 /// If a string contains multiple contiguous separators, you will end up
1233 /// with empty strings yielded by the iterator:
1234 ///
1235 /// ```
1236 /// use bstr::{B, ByteSlice};
1237 ///
1238 /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect();
1239 /// assert_eq!(x, vec![
1240 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1241 /// ]);
1242 ///
1243 /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect();
1244 /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]);
1245 /// ```
1246 ///
1247 /// Separators at the start or end of a string are neighbored by empty
1248 /// strings.
1249 ///
1250 /// ```
1251 /// use bstr::{B, ByteSlice};
1252 ///
1253 /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect();
1254 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1255 /// ```
1256 ///
1257 /// When the empty string is used as a separator, it splits every **byte**
1258 /// in the byte string, along with the beginning and end of the byte
1259 /// string.
1260 ///
1261 /// ```
1262 /// use bstr::{B, ByteSlice};
1263 ///
1264 /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect();
1265 /// assert_eq!(x, vec![
1266 /// B(""), B("t"), B("s"), B("u"), B("r"), B(""),
1267 /// ]);
1268 ///
1269 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1270 /// // may not be valid UTF-8!
1271 /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect();
1272 /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]);
1273 /// ```
1274 ///
1275 /// Contiguous separators, especially whitespace, can lead to possibly
1276 /// surprising behavior. For example, this code is correct:
1277 ///
1278 /// ```
1279 /// use bstr::{B, ByteSlice};
1280 ///
/// let x: Vec<&[u8]> = b"    a  b c".rsplit_str(" ").collect();
1282 /// assert_eq!(x, vec![
1283 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1284 /// ]);
1285 /// ```
1286 ///
1287 /// It does *not* give you `["a", "b", "c"]`.
1288 #[inline]
rsplit_str<'a, B: ?Sized + AsRef<[u8]>>( &'a self, splitter: &'a B, ) -> SplitReverse<'a>1289 fn rsplit_str<'a, B: ?Sized + AsRef<[u8]>>(
1290 &'a self,
1291 splitter: &'a B,
1292 ) -> SplitReverse<'a> {
1293 SplitReverse::new(self.as_bytes(), splitter.as_ref())
1294 }
1295
1296 /// Returns an iterator of at most `limit` substrings of this byte string,
1297 /// separated by the given byte string. If `limit` substrings are yielded,
1298 /// then the last substring will contain the remainder of this byte string.
1299 ///
/// The splitter may be any type that can be cheaply converted into a
1301 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1302 ///
1303 /// # Examples
1304 ///
1305 /// Basic usage:
1306 ///
1307 /// ```
1308 /// use bstr::{B, ByteSlice};
1309 ///
1310 /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect();
1311 /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]);
1312 ///
1313 /// let x: Vec<_> = b"".splitn_str(3, "X").collect();
1314 /// assert_eq!(x, vec![b""]);
1315 ///
1316 /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect();
1317 /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]);
1318 ///
1319 /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect();
1320 /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]);
1321 ///
1322 /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect();
1323 /// assert_eq!(x, vec![B("abcXdef")]);
1324 ///
1325 /// let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect();
1326 /// assert_eq!(x, vec![B("abcdef")]);
1327 ///
1328 /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect();
1329 /// assert!(x.is_empty());
1330 /// ```
1331 #[inline]
splitn_str<'a, B: ?Sized + AsRef<[u8]>>( &'a self, limit: usize, splitter: &'a B, ) -> SplitN<'a>1332 fn splitn_str<'a, B: ?Sized + AsRef<[u8]>>(
1333 &'a self,
1334 limit: usize,
1335 splitter: &'a B,
1336 ) -> SplitN<'a> {
1337 SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
1338 }
1339
1340 /// Returns an iterator of at most `limit` substrings of this byte string,
1341 /// separated by the given byte string, in reverse. If `limit` substrings
1342 /// are yielded, then the last substring will contain the remainder of this
1343 /// byte string.
1344 ///
/// The splitter may be any type that can be cheaply converted into a
1346 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1347 ///
1348 /// # Examples
1349 ///
1350 /// Basic usage:
1351 ///
1352 /// ```
1353 /// use bstr::{B, ByteSlice};
1354 ///
1355 /// let x: Vec<_> =
1356 /// b"Mary had a little lamb".rsplitn_str(3, " ").collect();
1357 /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]);
1358 ///
1359 /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect();
1360 /// assert_eq!(x, vec![b""]);
1361 ///
1362 /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect();
1363 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]);
1364 ///
1365 /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect();
1366 /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]);
1367 ///
1368 /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect();
1369 /// assert_eq!(x, vec![B("abcXdef")]);
1370 ///
1371 /// let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect();
1372 /// assert_eq!(x, vec![B("abcdef")]);
1373 ///
1374 /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect();
1375 /// assert!(x.is_empty());
1376 /// ```
1377 #[inline]
rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>( &'a self, limit: usize, splitter: &'a B, ) -> SplitNReverse<'a>1378 fn rsplitn_str<'a, B: ?Sized + AsRef<[u8]>>(
1379 &'a self,
1380 limit: usize,
1381 splitter: &'a B,
1382 ) -> SplitNReverse<'a> {
1383 SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
1384 }
1385
/// Replace all matches of the given needle with the given replacement, and
/// return the result as a new `Vec<u8>`.
1388 ///
1389 /// This routine is useful as a convenience. If you need to reuse an
1390 /// allocation, use [`replace_into`](#method.replace_into) instead.
1391 ///
1392 /// # Examples
1393 ///
1394 /// Basic usage:
1395 ///
1396 /// ```
1397 /// use bstr::ByteSlice;
1398 ///
1399 /// let s = b"this is old".replace("old", "new");
1400 /// assert_eq!(s, "this is new".as_bytes());
1401 /// ```
1402 ///
1403 /// When the pattern doesn't match:
1404 ///
1405 /// ```
1406 /// use bstr::ByteSlice;
1407 ///
1408 /// let s = b"this is old".replace("nada nada", "limonada");
1409 /// assert_eq!(s, "this is old".as_bytes());
1410 /// ```
1411 ///
1412 /// When the needle is an empty string:
1413 ///
1414 /// ```
1415 /// use bstr::ByteSlice;
1416 ///
1417 /// let s = b"foo".replace("", "Z");
1418 /// assert_eq!(s, "ZfZoZoZ".as_bytes());
1419 /// ```
#[cfg(feature = "std")]
#[inline]
fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
) -> Vec<u8> {
    // Start with room for the unmodified length; the vector still grows on
    // demand if the replacements make the output longer than the input.
    let initial_capacity = self.as_bytes().len();
    let mut output = Vec::with_capacity(initial_capacity);
    self.replace_into(needle, replacement, &mut output);
    output
}
1431
/// Replace up to `limit` matches of the given needle with the given
/// replacement, and return the result as a new `Vec<u8>`.
1434 ///
1435 /// This routine is useful as a convenience. If you need to reuse an
1436 /// allocation, use [`replacen_into`](#method.replacen_into) instead.
1437 ///
1438 /// # Examples
1439 ///
1440 /// Basic usage:
1441 ///
1442 /// ```
1443 /// use bstr::ByteSlice;
1444 ///
1445 /// let s = b"foofoo".replacen("o", "z", 2);
1446 /// assert_eq!(s, "fzzfoo".as_bytes());
1447 /// ```
1448 ///
1449 /// When the pattern doesn't match:
1450 ///
1451 /// ```
1452 /// use bstr::ByteSlice;
1453 ///
1454 /// let s = b"foofoo".replacen("a", "z", 2);
1455 /// assert_eq!(s, "foofoo".as_bytes());
1456 /// ```
1457 ///
1458 /// When the needle is an empty string:
1459 ///
1460 /// ```
1461 /// use bstr::ByteSlice;
1462 ///
1463 /// let s = b"foo".replacen("", "Z", 2);
1464 /// assert_eq!(s, "ZfZoo".as_bytes());
1465 /// ```
#[cfg(feature = "std")]
#[inline]
fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
    limit: usize,
) -> Vec<u8> {
    // Start with room for the unmodified length; the vector still grows on
    // demand if the replacements make the output longer than the input.
    let initial_capacity = self.as_bytes().len();
    let mut output = Vec::with_capacity(initial_capacity);
    self.replacen_into(needle, replacement, limit, &mut output);
    output
}
1478
1479 /// Replace all matches of the given needle with the given replacement,
1480 /// and write the result into the provided `Vec<u8>`.
1481 ///
1482 /// This does **not** clear `dest` before writing to it.
1483 ///
1484 /// This routine is useful for reusing allocation. For a more convenient
1485 /// API, use [`replace`](#method.replace) instead.
1486 ///
1487 /// # Examples
1488 ///
1489 /// Basic usage:
1490 ///
1491 /// ```
1492 /// use bstr::ByteSlice;
1493 ///
1494 /// let s = b"this is old";
1495 ///
1496 /// let mut dest = vec![];
1497 /// s.replace_into("old", "new", &mut dest);
1498 /// assert_eq!(dest, "this is new".as_bytes());
1499 /// ```
1500 ///
1501 /// When the pattern doesn't match:
1502 ///
1503 /// ```
1504 /// use bstr::ByteSlice;
1505 ///
1506 /// let s = b"this is old";
1507 ///
1508 /// let mut dest = vec![];
1509 /// s.replace_into("nada nada", "limonada", &mut dest);
1510 /// assert_eq!(dest, "this is old".as_bytes());
1511 /// ```
1512 ///
1513 /// When the needle is an empty string:
1514 ///
1515 /// ```
1516 /// use bstr::ByteSlice;
1517 ///
1518 /// let s = b"foo";
1519 ///
1520 /// let mut dest = vec![];
1521 /// s.replace_into("", "Z", &mut dest);
1522 /// assert_eq!(dest, "ZfZoZoZ".as_bytes());
1523 /// ```
#[cfg(feature = "std")]
#[inline]
fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
    dest: &mut Vec<u8>,
) {
    let needle = needle.as_ref();
    let replacement = replacement.as_ref();
    let haystack = self.as_bytes();

    // `copied_up_to` marks the end of the haystack prefix that has already
    // been accounted for (either copied verbatim or replaced).
    let mut copied_up_to = 0;
    for match_start in self.find_iter(needle) {
        // Copy the untouched gap before this match, then the replacement.
        dest.push_str(&haystack[copied_up_to..match_start]);
        dest.push_str(replacement);
        copied_up_to = match_start + needle.len();
    }
    // Whatever follows the final match is appended unchanged.
    dest.push_str(&haystack[copied_up_to..]);
}
1542
1543 /// Replace up to `limit` matches of the given needle with the given
1544 /// replacement, and write the result into the provided `Vec<u8>`.
1545 ///
1546 /// This does **not** clear `dest` before writing to it.
1547 ///
1548 /// This routine is useful for reusing allocation. For a more convenient
1549 /// API, use [`replacen`](#method.replacen) instead.
1550 ///
1551 /// # Examples
1552 ///
1553 /// Basic usage:
1554 ///
1555 /// ```
1556 /// use bstr::ByteSlice;
1557 ///
1558 /// let s = b"foofoo";
1559 ///
1560 /// let mut dest = vec![];
1561 /// s.replacen_into("o", "z", 2, &mut dest);
1562 /// assert_eq!(dest, "fzzfoo".as_bytes());
1563 /// ```
1564 ///
1565 /// When the pattern doesn't match:
1566 ///
1567 /// ```
1568 /// use bstr::ByteSlice;
1569 ///
1570 /// let s = b"foofoo";
1571 ///
1572 /// let mut dest = vec![];
1573 /// s.replacen_into("a", "z", 2, &mut dest);
1574 /// assert_eq!(dest, "foofoo".as_bytes());
1575 /// ```
1576 ///
1577 /// When the needle is an empty string:
1578 ///
1579 /// ```
1580 /// use bstr::ByteSlice;
1581 ///
1582 /// let s = b"foo";
1583 ///
1584 /// let mut dest = vec![];
1585 /// s.replacen_into("", "Z", 2, &mut dest);
1586 /// assert_eq!(dest, "ZfZoo".as_bytes());
1587 /// ```
#[cfg(feature = "std")]
#[inline]
fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
    &self,
    needle: N,
    replacement: R,
    limit: usize,
    dest: &mut Vec<u8>,
) {
    let needle = needle.as_ref();
    let replacement = replacement.as_ref();
    let haystack = self.as_bytes();

    // Only the first `limit` matches are rewritten; later ones fall into
    // the tail that is copied verbatim after the loop.
    let limited_matches = self.find_iter(needle).take(limit);

    // `copied_up_to` marks the end of the haystack prefix that has already
    // been accounted for (either copied verbatim or replaced).
    let mut copied_up_to = 0;
    for match_start in limited_matches {
        dest.push_str(&haystack[copied_up_to..match_start]);
        dest.push_str(replacement);
        copied_up_to = match_start + needle.len();
    }
    dest.push_str(&haystack[copied_up_to..]);
}
1607
1608 /// Returns an iterator over the bytes in this byte string.
1609 ///
1610 /// # Examples
1611 ///
1612 /// Basic usage:
1613 ///
1614 /// ```
1615 /// use bstr::ByteSlice;
1616 ///
1617 /// let bs = b"foobar";
1618 /// let bytes: Vec<u8> = bs.bytes().collect();
1619 /// assert_eq!(bytes, bs);
1620 /// ```
1621 #[inline]
bytes(&self) -> Bytes1622 fn bytes(&self) -> Bytes {
1623 Bytes { it: self.as_bytes().iter() }
1624 }
1625
1626 /// Returns an iterator over the Unicode scalar values in this byte string.
1627 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1628 /// is yielded instead.
1629 ///
1630 /// # Examples
1631 ///
1632 /// Basic usage:
1633 ///
1634 /// ```
1635 /// use bstr::ByteSlice;
1636 ///
1637 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1638 /// let chars: Vec<char> = bs.chars().collect();
/// assert_eq!(vec!['☃', '\u{FFFD}', '\u{1D783}', '\u{FFFD}', 'a'], chars);
1640 /// ```
1641 ///
1642 /// Codepoints can also be iterated over in reverse:
1643 ///
1644 /// ```
1645 /// use bstr::ByteSlice;
1646 ///
1647 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1648 /// let chars: Vec<char> = bs.chars().rev().collect();
/// assert_eq!(vec!['a', '\u{FFFD}', '\u{1D783}', '\u{FFFD}', '☃'], chars);
1650 /// ```
1651 #[inline]
chars(&self) -> Chars1652 fn chars(&self) -> Chars {
1653 Chars::new(self.as_bytes())
1654 }
1655
1656 /// Returns an iterator over the Unicode scalar values in this byte string
1657 /// along with their starting and ending byte index positions. If invalid
1658 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1659 /// instead.
1660 ///
1661 /// Note that this is slightly different from the `CharIndices` iterator
1662 /// provided by the standard library. Aside from working on possibly
1663 /// invalid UTF-8, this iterator provides both the corresponding starting
1664 /// and ending byte indices of each codepoint yielded. The ending position
1665 /// is necessary to slice the original byte string when invalid UTF-8 bytes
1666 /// are converted into a Unicode replacement codepoint, since a single
1667 /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes
1668 /// (inclusive).
1669 ///
1670 /// # Examples
1671 ///
1672 /// Basic usage:
1673 ///
1674 /// ```
1675 /// use bstr::ByteSlice;
1676 ///
1677 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1678 /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
1679 /// assert_eq!(chars, vec![
1680 /// (0, 3, '☃'),
1681 /// (3, 4, '\u{FFFD}'),
///     (4, 8, '\u{1D783}'),
1683 /// (8, 10, '\u{FFFD}'),
1684 /// (10, 11, 'a'),
1685 /// ]);
1686 /// ```
1687 ///
1688 /// Codepoints can also be iterated over in reverse:
1689 ///
1690 /// ```
1691 /// use bstr::ByteSlice;
1692 ///
1693 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1694 /// let chars: Vec<(usize, usize, char)> = bs
1695 /// .char_indices()
1696 /// .rev()
1697 /// .collect();
1698 /// assert_eq!(chars, vec![
1699 /// (10, 11, 'a'),
1700 /// (8, 10, '\u{FFFD}'),
///     (4, 8, '\u{1D783}'),
1702 /// (3, 4, '\u{FFFD}'),
1703 /// (0, 3, '☃'),
1704 /// ]);
1705 /// ```
1706 #[inline]
char_indices(&self) -> CharIndices1707 fn char_indices(&self) -> CharIndices {
1708 CharIndices::new(self.as_bytes())
1709 }
1710
1711 /// Iterate over chunks of valid UTF-8.
1712 ///
1713 /// The iterator returned yields chunks of valid UTF-8 separated by invalid
1714 /// UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes,
1715 /// which are determined via the "substitution of maximal subparts"
1716 /// strategy described in the docs for the
1717 /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
1718 /// method.
1719 ///
1720 /// # Examples
1721 ///
1722 /// This example shows how to gather all valid and invalid chunks from a
1723 /// byte slice:
1724 ///
1725 /// ```
1726 /// use bstr::{ByteSlice, Utf8Chunk};
1727 ///
1728 /// let bytes = b"foo\xFD\xFEbar\xFF";
1729 ///
1730 /// let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]);
1731 /// for chunk in bytes.utf8_chunks() {
1732 /// if !chunk.valid().is_empty() {
1733 /// valid_chunks.push(chunk.valid());
1734 /// }
1735 /// if !chunk.invalid().is_empty() {
1736 /// invalid_chunks.push(chunk.invalid());
1737 /// }
1738 /// }
1739 ///
1740 /// assert_eq!(valid_chunks, vec!["foo", "bar"]);
1741 /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
1742 /// ```
1743 #[inline]
utf8_chunks(&self) -> Utf8Chunks1744 fn utf8_chunks(&self) -> Utf8Chunks {
1745 Utf8Chunks { bytes: self.as_bytes() }
1746 }
1747
1748 /// Returns an iterator over the grapheme clusters in this byte string.
1749 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1750 /// is yielded instead.
1751 ///
1752 /// # Examples
1753 ///
1754 /// This example shows how multiple codepoints can combine to form a
1755 /// single grapheme cluster:
1756 ///
1757 /// ```
1758 /// use bstr::ByteSlice;
1759 ///
1760 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1761 /// let graphemes: Vec<&str> = bs.graphemes().collect();
/// assert_eq!(vec!["à̖", "\u{1F1FA}\u{1F1F8}"], graphemes);
1763 /// ```
1764 ///
1765 /// This shows that graphemes can be iterated over in reverse:
1766 ///
1767 /// ```
1768 /// use bstr::ByteSlice;
1769 ///
1770 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1771 /// let graphemes: Vec<&str> = bs.graphemes().rev().collect();
/// assert_eq!(vec!["\u{1F1FA}\u{1F1F8}", "à̖"], graphemes);
1773 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn graphemes(&self) -> Graphemes {
    // The iterator borrows the underlying bytes; nothing is copied here.
    let haystack = self.as_bytes();
    Graphemes::new(haystack)
}
1779
1780 /// Returns an iterator over the grapheme clusters in this byte string
1781 /// along with their starting and ending byte index positions. If invalid
1782 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1783 /// instead.
1784 ///
1785 /// # Examples
1786 ///
1787 /// This example shows how to get the byte offsets of each individual
1788 /// grapheme cluster:
1789 ///
1790 /// ```
1791 /// use bstr::ByteSlice;
1792 ///
1793 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1794 /// let graphemes: Vec<(usize, usize, &str)> =
1795 /// bs.grapheme_indices().collect();
/// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "\u{1F1FA}\u{1F1F8}")], graphemes);
1797 /// ```
1798 ///
/// This example shows what happens when invalid UTF-8 is encountered. Note
1800 /// that the offsets are valid indices into the original string, and do
1801 /// not necessarily correspond to the length of the `&str` returned!
1802 ///
1803 /// ```
1804 /// use bstr::{ByteSlice, ByteVec};
1805 ///
1806 /// let mut bytes = vec![];
1807 /// bytes.push_str("a\u{0300}\u{0316}");
1808 /// bytes.push(b'\xFF');
1809 /// bytes.push_str("\u{1F1FA}\u{1F1F8}");
1810 ///
1811 /// let graphemes: Vec<(usize, usize, &str)> =
1812 /// bytes.grapheme_indices().collect();
1813 /// assert_eq!(
1814 /// graphemes,
///     vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "\u{1F1FA}\u{1F1F8}")]
1816 /// );
1817 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn grapheme_indices(&self) -> GraphemeIndices {
    // The iterator borrows the underlying bytes; nothing is copied here.
    let haystack = self.as_bytes();
    GraphemeIndices::new(haystack)
}
1823
1824 /// Returns an iterator over the words in this byte string. If invalid
1825 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1826 /// instead.
1827 ///
1828 /// This is similar to
1829 /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks),
1830 /// except it only returns elements that contain a "word" character. A word
1831 /// character is defined by UTS #18 (Annex C) to be the combination of the
1832 /// `Alphabetic` and `Join_Control` properties, along with the
1833 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1834 /// categories.
1835 ///
1836 /// Since words are made up of one or more codepoints, this iterator
1837 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1838 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1839 ///
1840 /// # Examples
1841 ///
1842 /// Basic usage:
1843 ///
1844 /// ```
1845 /// use bstr::ByteSlice;
1846 ///
1847 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1848 /// let words: Vec<&str> = bs.words().collect();
1849 /// assert_eq!(words, vec![
1850 /// "The", "quick", "brown", "fox", "can't",
1851 /// "jump", "32.3", "feet", "right",
1852 /// ]);
1853 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words(&self) -> Words {
    // The iterator borrows the underlying bytes; nothing is copied here.
    let haystack = self.as_bytes();
    Words::new(haystack)
}
1859
1860 /// Returns an iterator over the words in this byte string along with
1861 /// their starting and ending byte index positions.
1862 ///
1863 /// This is similar to
1864 /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices),
1865 /// except it only returns elements that contain a "word" character. A word
1866 /// character is defined by UTS #18 (Annex C) to be the combination of the
1867 /// `Alphabetic` and `Join_Control` properties, along with the
1868 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1869 /// categories.
1870 ///
1871 /// Since words are made up of one or more codepoints, this iterator
1872 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1873 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1874 ///
1875 /// # Examples
1876 ///
1877 /// This example shows how to get the byte offsets of each individual
1878 /// word:
1879 ///
1880 /// ```
1881 /// use bstr::ByteSlice;
1882 ///
1883 /// let bs = b"can't jump 32.3 feet";
1884 /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect();
1885 /// assert_eq!(words, vec![
1886 /// (0, 5, "can't"),
1887 /// (6, 10, "jump"),
1888 /// (11, 15, "32.3"),
1889 /// (16, 20, "feet"),
1890 /// ]);
1891 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn word_indices(&self) -> WordIndices {
    // Hand the underlying bytes to the word index iterator.
    let haystack = self.as_bytes();
    WordIndices::new(haystack)
}
1897
1898 /// Returns an iterator over the words in this byte string, along with
1899 /// all breaks between the words. Concatenating all elements yielded by
1900 /// the iterator results in the original string (modulo Unicode replacement
1901 /// codepoint substitutions if invalid UTF-8 is encountered).
1902 ///
1903 /// Since words are made up of one or more codepoints, this iterator
1904 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1905 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1906 ///
1907 /// # Examples
1908 ///
1909 /// Basic usage:
1910 ///
1911 /// ```
1912 /// use bstr::ByteSlice;
1913 ///
1914 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1915 /// let words: Vec<&str> = bs.words_with_breaks().collect();
1916 /// assert_eq!(words, vec![
1917 /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")",
1918 /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet",
1919 /// ",", " ", "right", "?",
1920 /// ]);
1921 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words_with_breaks(&self) -> WordsWithBreaks {
    // Hand the underlying bytes to the word-and-break iterator.
    let haystack = self.as_bytes();
    WordsWithBreaks::new(haystack)
}
1927
1928 /// Returns an iterator over the words and their byte offsets in this
1929 /// byte string, along with all breaks between the words. Concatenating
1930 /// all elements yielded by the iterator results in the original string
1931 /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is
1932 /// encountered).
1933 ///
1934 /// Since words are made up of one or more codepoints, this iterator
1935 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1936 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1937 ///
1938 /// # Examples
1939 ///
1940 /// This example shows how to get the byte offsets of each individual
1941 /// word:
1942 ///
1943 /// ```
1944 /// use bstr::ByteSlice;
1945 ///
1946 /// let bs = b"can't jump 32.3 feet";
1947 /// let words: Vec<(usize, usize, &str)> =
1948 /// bs.words_with_break_indices().collect();
1949 /// assert_eq!(words, vec![
1950 /// (0, 5, "can't"),
1951 /// (5, 6, " "),
1952 /// (6, 10, "jump"),
1953 /// (10, 11, " "),
1954 /// (11, 15, "32.3"),
1955 /// (15, 16, " "),
1956 /// (16, 20, "feet"),
1957 /// ]);
1958 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words_with_break_indices(&self) -> WordsWithBreakIndices {
    // Hand the underlying bytes to the word-and-break index iterator.
    let haystack = self.as_bytes();
    WordsWithBreakIndices::new(haystack)
}
1964
1965 /// Returns an iterator over the sentences in this byte string.
1966 ///
1967 /// Typically, a sentence will include its trailing punctuation and
1968 /// whitespace. Concatenating all elements yielded by the iterator
1969 /// results in the original string (modulo Unicode replacement codepoint
1970 /// substitutions if invalid UTF-8 is encountered).
1971 ///
1972 /// Since sentences are made up of one or more codepoints, this iterator
1973 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1974 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1975 ///
1976 /// # Examples
1977 ///
1978 /// Basic usage:
1979 ///
1980 /// ```
1981 /// use bstr::ByteSlice;
1982 ///
1983 /// let bs = b"I want this. Not that. Right now.";
1984 /// let sentences: Vec<&str> = bs.sentences().collect();
1985 /// assert_eq!(sentences, vec![
1986 /// "I want this. ",
1987 /// "Not that. ",
1988 /// "Right now.",
1989 /// ]);
1990 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn sentences(&self) -> Sentences {
    // Hand the underlying bytes to the sentence iterator.
    let haystack = self.as_bytes();
    Sentences::new(haystack)
}
1996
1997 /// Returns an iterator over the sentences in this byte string along with
1998 /// their starting and ending byte index positions.
1999 ///
2000 /// Typically, a sentence will include its trailing punctuation and
2001 /// whitespace. Concatenating all elements yielded by the iterator
2002 /// results in the original string (modulo Unicode replacement codepoint
2003 /// substitutions if invalid UTF-8 is encountered).
2004 ///
2005 /// Since sentences are made up of one or more codepoints, this iterator
2006 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2007 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2008 ///
2009 /// # Examples
2010 ///
2011 /// Basic usage:
2012 ///
2013 /// ```
2014 /// use bstr::ByteSlice;
2015 ///
2016 /// let bs = b"I want this. Not that. Right now.";
2017 /// let sentences: Vec<(usize, usize, &str)> =
2018 /// bs.sentence_indices().collect();
2019 /// assert_eq!(sentences, vec![
2020 /// (0, 13, "I want this. "),
2021 /// (13, 23, "Not that. "),
2022 /// (23, 33, "Right now."),
2023 /// ]);
2024 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn sentence_indices(&self) -> SentenceIndices {
    // Hand the underlying bytes to the sentence index iterator.
    let haystack = self.as_bytes();
    SentenceIndices::new(haystack)
}
2030
2031 /// An iterator over all lines in a byte string, without their
2032 /// terminators.
2033 ///
2034 /// For this iterator, the only line terminators recognized are `\r\n` and
2035 /// `\n`.
2036 ///
2037 /// # Examples
2038 ///
2039 /// Basic usage:
2040 ///
2041 /// ```
2042 /// use bstr::{B, ByteSlice};
2043 ///
2044 /// let s = b"\
2045 /// foo
2046 ///
2047 /// bar\r
2048 /// baz
2049 ///
2050 ///
2051 /// quux";
2052 /// let lines: Vec<&[u8]> = s.lines().collect();
2053 /// assert_eq!(lines, vec![
2054 /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"),
2055 /// ]);
2056 /// ```
2057 #[inline]
lines(&self) -> Lines2058 fn lines(&self) -> Lines {
2059 Lines::new(self.as_bytes())
2060 }
2061
2062 /// An iterator over all lines in a byte string, including their
2063 /// terminators.
2064 ///
2065 /// For this iterator, the only line terminator recognized is `\n`. (Since
2066 /// line terminators are included, this also handles `\r\n` line endings.)
2067 ///
2068 /// Line terminators are only included if they are present in the original
2069 /// byte string. For example, the last line in a byte string may not end
2070 /// with a line terminator.
2071 ///
2072 /// Concatenating all elements yielded by this iterator is guaranteed to
2073 /// yield the original byte string.
2074 ///
2075 /// # Examples
2076 ///
2077 /// Basic usage:
2078 ///
2079 /// ```
2080 /// use bstr::{B, ByteSlice};
2081 ///
2082 /// let s = b"\
2083 /// foo
2084 ///
2085 /// bar\r
2086 /// baz
2087 ///
2088 ///
2089 /// quux";
2090 /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect();
2091 /// assert_eq!(lines, vec![
2092 /// B("foo\n"),
2093 /// B("\n"),
2094 /// B("bar\r\n"),
2095 /// B("baz\n"),
2096 /// B("\n"),
2097 /// B("\n"),
2098 /// B("quux"),
2099 /// ]);
2100 /// ```
2101 #[inline]
lines_with_terminator(&self) -> LinesWithTerminator2102 fn lines_with_terminator(&self) -> LinesWithTerminator {
2103 LinesWithTerminator::new(self.as_bytes())
2104 }
2105
2106 /// Return a byte string slice with leading and trailing whitespace
2107 /// removed.
2108 ///
2109 /// Whitespace is defined according to the terms of the `White_Space`
2110 /// Unicode property.
2111 ///
2112 /// # Examples
2113 ///
2114 /// Basic usage:
2115 ///
2116 /// ```
2117 /// use bstr::{B, ByteSlice};
2118 ///
2119 /// let s = B(" foo\tbar\t\u{2003}\n");
2120 /// assert_eq!(s.trim(), B("foo\tbar"));
2121 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim(&self) -> &[u8] {
    // Strip the front first, then strip the back of what remains.
    let without_leading = self.trim_start();
    without_leading.trim_end()
}
2127
2128 /// Return a byte string slice with leading whitespace removed.
2129 ///
2130 /// Whitespace is defined according to the terms of the `White_Space`
2131 /// Unicode property.
2132 ///
2133 /// # Examples
2134 ///
2135 /// Basic usage:
2136 ///
2137 /// ```
2138 /// use bstr::{B, ByteSlice};
2139 ///
2140 /// let s = B(" foo\tbar\t\u{2003}\n");
2141 /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n"));
2142 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim_start(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // Everything before this offset is leading whitespace.
    let first_kept = whitespace_len_fwd(bytes);
    &bytes[first_kept..]
}
2149
2150 /// Return a byte string slice with trailing whitespace removed.
2151 ///
2152 /// Whitespace is defined according to the terms of the `White_Space`
2153 /// Unicode property.
2154 ///
2155 /// # Examples
2156 ///
2157 /// Basic usage:
2158 ///
2159 /// ```
2160 /// use bstr::{B, ByteSlice};
2161 ///
2162 /// let s = B(" foo\tbar\t\u{2003}\n");
2163 /// assert_eq!(s.trim_end(), B(" foo\tbar"));
2164 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim_end(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // Everything from this offset onwards is trailing whitespace.
    let kept_len = whitespace_len_rev(bytes);
    &bytes[..kept_len]
}
2171
2172 /// Return a byte string slice with leading and trailing characters
2173 /// satisfying the given predicate removed.
2174 ///
2175 /// # Examples
2176 ///
2177 /// Basic usage:
2178 ///
2179 /// ```
2180 /// use bstr::{B, ByteSlice};
2181 ///
2182 /// let s = b"123foo5bar789";
2183 /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
2184 /// ```
2185 #[inline]
trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8]2186 fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2187 self.trim_start_with(&mut trim).trim_end_with(&mut trim)
2188 }
2189
2190 /// Return a byte string slice with leading characters satisfying the given
2191 /// predicate removed.
2192 ///
2193 /// # Examples
2194 ///
2195 /// Basic usage:
2196 ///
2197 /// ```
2198 /// use bstr::{B, ByteSlice};
2199 ///
2200 /// let s = b"123foo5bar789";
2201 /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
2202 /// ```
2203 #[inline]
trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8]2204 fn trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2205 for (s, _, ch) in self.char_indices() {
2206 if !trim(ch) {
2207 return &self.as_bytes()[s..];
2208 }
2209 }
2210 b""
2211 }
2212
2213 /// Return a byte string slice with trailing characters satisfying the
2214 /// given predicate removed.
2215 ///
2216 /// # Examples
2217 ///
2218 /// Basic usage:
2219 ///
2220 /// ```
2221 /// use bstr::{B, ByteSlice};
2222 ///
2223 /// let s = b"123foo5bar789";
2224 /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
2225 /// ```
2226 #[inline]
trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8]2227 fn trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2228 for (_, e, ch) in self.char_indices().rev() {
2229 if !trim(ch) {
2230 return &self.as_bytes()[..e];
2231 }
2232 }
2233 b""
2234 }
2235
2236 /// Returns a new `Vec<u8>` containing the lowercase equivalent of this
2237 /// byte string.
2238 ///
2239 /// In this case, lowercase is defined according to the `Lowercase` Unicode
2240 /// property.
2241 ///
/// If invalid UTF-8 is seen, or if a character has no lowercase variant,
/// then it is copied to the returned `Vec<u8>` unchanged.
///
/// Note that some characters in this byte string may expand into multiple
/// characters when changing the case, so the length of the returned
/// `Vec<u8>` may not be equal to the number of bytes in this byte
/// string.
2249 ///
2250 /// If you'd like to reuse an allocation for performance reasons, then use
2251 /// [`to_lowercase_into`](#method.to_lowercase_into) instead.
2252 ///
2253 /// # Examples
2254 ///
2255 /// Basic usage:
2256 ///
2257 /// ```
2258 /// use bstr::{B, ByteSlice};
2259 ///
2260 /// let s = B("HELLO Β");
2261 /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes());
2262 /// ```
2263 ///
2264 /// Scripts without case are not changed:
2265 ///
2266 /// ```
2267 /// use bstr::{B, ByteSlice};
2268 ///
2269 /// let s = B("农历新年");
2270 /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes());
2271 /// ```
2272 ///
2273 /// Invalid UTF-8 remains as is:
2274 ///
2275 /// ```
2276 /// use bstr::{B, ByteSlice};
2277 ///
2278 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2279 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
2280 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let the `_into` variant fill it.
    let mut lowered = Vec::new();
    self.to_lowercase_into(&mut lowered);
    lowered
}
2288
2289 /// Writes the lowercase equivalent of this byte string into the given
2290 /// buffer. The buffer is not cleared before written to.
2291 ///
2292 /// In this case, lowercase is defined according to the `Lowercase`
2293 /// Unicode property.
2294 ///
2295 /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
2296 /// then it is written to the given buffer unchanged.
2297 ///
2298 /// Note that some characters in this byte string may expand into multiple
2299 /// characters when changing the case, so the number of bytes written to
2300 /// the given byte string may not be equivalent to the number of bytes in
2301 /// this byte string.
2302 ///
2303 /// If you don't need to amortize allocation and instead prefer
2304 /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead.
2305 ///
2306 /// # Examples
2307 ///
2308 /// Basic usage:
2309 ///
2310 /// ```
2311 /// use bstr::{B, ByteSlice};
2312 ///
2313 /// let s = B("HELLO Β");
2314 ///
2315 /// let mut buf = vec![];
2316 /// s.to_lowercase_into(&mut buf);
2317 /// assert_eq!("hello β".as_bytes(), buf.as_bytes());
2318 /// ```
2319 ///
2320 /// Scripts without case are not changed:
2321 ///
2322 /// ```
2323 /// use bstr::{B, ByteSlice};
2324 ///
2325 /// let s = B("农历新年");
2326 ///
2327 /// let mut buf = vec![];
2328 /// s.to_lowercase_into(&mut buf);
2329 /// assert_eq!("农历新年".as_bytes(), buf.as_bytes());
2330 /// ```
2331 ///
2332 /// Invalid UTF-8 remains as is:
2333 ///
2334 /// ```
2335 /// use bstr::{B, ByteSlice};
2336 ///
2337 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2338 ///
2339 /// let mut buf = vec![];
2340 /// s.to_lowercase_into(&mut buf);
2341 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
2342 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: This is the best we can do given what std exposes I think.
    // If we roll our own case handling, then we might be able to do this
    // a bit faster. We shouldn't roll our own case handling unless we
    // need to, e.g., for doing caseless matching or case folding.

    // TODO(BUG): This doesn't handle any special casing rules.

    let bytes = self.as_bytes();
    // The output is usually about as long as the input, so reserve that
    // much up front.
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        match ch {
            // A replacement codepoint: copy the source bytes verbatim so
            // that invalid UTF-8 is carried over unchanged.
            '\u{FFFD}' => buf.push_str(&bytes[start..end]),
            // ASCII takes the cheap single-character path.
            c if c.is_ascii() => buf.push_char(c.to_ascii_lowercase()),
            // A lowercase mapping may expand to several codepoints.
            c => {
                for lower in c.to_lowercase() {
                    buf.push_char(lower);
                }
            }
        }
    }
}
2366
2367 /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of
2368 /// this byte string.
2369 ///
2370 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2371 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2372 /// In particular, the length of the byte string returned is always
2373 /// equivalent to the length of this byte string.
2374 ///
2375 /// If you'd like to reuse an allocation for performance reasons, then use
2376 /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform
2377 /// the conversion in place.
2378 ///
2379 /// # Examples
2380 ///
2381 /// Basic usage:
2382 ///
2383 /// ```
2384 /// use bstr::{B, ByteSlice};
2385 ///
2386 /// let s = B("HELLO Β");
2387 /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes());
2388 /// ```
2389 ///
2390 /// Invalid UTF-8 remains as is:
2391 ///
2392 /// ```
2393 /// use bstr::{B, ByteSlice};
2394 ///
2395 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2396 /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
2397 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
    // Delegate to the method of the same name on the underlying bytes.
    let bytes = self.as_bytes();
    bytes.to_ascii_lowercase()
}
2403
2404 /// Convert this byte string to its lowercase ASCII equivalent in place.
2405 ///
2406 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2407 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2408 ///
2409 /// If you don't need to do the conversion in
2410 /// place and instead prefer convenience, then use
2411 /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead.
2412 ///
2413 /// # Examples
2414 ///
2415 /// Basic usage:
2416 ///
2417 /// ```
2418 /// use bstr::ByteSlice;
2419 ///
2420 /// let mut s = <Vec<u8>>::from("HELLO Β");
2421 /// s.make_ascii_lowercase();
2422 /// assert_eq!(s, "hello Β".as_bytes());
2423 /// ```
2424 ///
2425 /// Invalid UTF-8 remains as is:
2426 ///
2427 /// ```
2428 /// use bstr::{B, ByteSlice, ByteVec};
2429 ///
2430 /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
2431 /// s.make_ascii_lowercase();
2432 /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
2433 /// ```
2434 #[inline]
make_ascii_lowercase(&mut self)2435 fn make_ascii_lowercase(&mut self) {
2436 self.as_bytes_mut().make_ascii_lowercase();
2437 }
2438
2439 /// Returns a new `Vec<u8>` containing the uppercase equivalent of this
2440 /// byte string.
2441 ///
2442 /// In this case, uppercase is defined according to the `Uppercase`
2443 /// Unicode property.
2444 ///
/// If invalid UTF-8 is seen, or if a character has no uppercase variant,
/// then it is copied to the returned `Vec<u8>` unchanged.
///
/// Note that some characters in this byte string may expand into multiple
/// characters when changing the case, so the length of the returned
/// `Vec<u8>` may not be equal to the number of bytes in this byte
/// string.
2452 ///
2453 /// If you'd like to reuse an allocation for performance reasons, then use
2454 /// [`to_uppercase_into`](#method.to_uppercase_into) instead.
2455 ///
2456 /// # Examples
2457 ///
2458 /// Basic usage:
2459 ///
2460 /// ```
2461 /// use bstr::{B, ByteSlice};
2462 ///
2463 /// let s = B("hello β");
2464 /// assert_eq!(s.to_uppercase(), B("HELLO Β"));
2465 /// ```
2466 ///
2467 /// Scripts without case are not changed:
2468 ///
2469 /// ```
2470 /// use bstr::{B, ByteSlice};
2471 ///
2472 /// let s = B("农历新年");
2473 /// assert_eq!(s.to_uppercase(), B("农历新年"));
2474 /// ```
2475 ///
2476 /// Invalid UTF-8 remains as is:
2477 ///
2478 /// ```
2479 /// use bstr::{B, ByteSlice};
2480 ///
2481 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2482 /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2483 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let the `_into` variant fill it.
    let mut raised = Vec::new();
    self.to_uppercase_into(&mut raised);
    raised
}
2491
2492 /// Writes the uppercase equivalent of this byte string into the given
2493 /// buffer. The buffer is not cleared before written to.
2494 ///
2495 /// In this case, uppercase is defined according to the `Uppercase`
2496 /// Unicode property.
2497 ///
2498 /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
2499 /// then it is written to the given buffer unchanged.
2500 ///
2501 /// Note that some characters in this byte string may expand into multiple
2502 /// characters when changing the case, so the number of bytes written to
2503 /// the given byte string may not be equivalent to the number of bytes in
2504 /// this byte string.
2505 ///
2506 /// If you don't need to amortize allocation and instead prefer
2507 /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead.
2508 ///
2509 /// # Examples
2510 ///
2511 /// Basic usage:
2512 ///
2513 /// ```
2514 /// use bstr::{B, ByteSlice};
2515 ///
2516 /// let s = B("hello β");
2517 ///
2518 /// let mut buf = vec![];
2519 /// s.to_uppercase_into(&mut buf);
2520 /// assert_eq!(buf, B("HELLO Β"));
2521 /// ```
2522 ///
2523 /// Scripts without case are not changed:
2524 ///
2525 /// ```
2526 /// use bstr::{B, ByteSlice};
2527 ///
2528 /// let s = B("农历新年");
2529 ///
2530 /// let mut buf = vec![];
2531 /// s.to_uppercase_into(&mut buf);
2532 /// assert_eq!(buf, B("农历新年"));
2533 /// ```
2534 ///
2535 /// Invalid UTF-8 remains as is:
2536 ///
2537 /// ```
2538 /// use bstr::{B, ByteSlice};
2539 ///
2540 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2541 ///
2542 /// let mut buf = vec![];
2543 /// s.to_uppercase_into(&mut buf);
2544 /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2545 /// ```
#[cfg(all(feature = "std", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: This is the best we can do given what std exposes I think.
    // If we roll our own case handling, then we might be able to do this
    // a bit faster. We shouldn't roll our own case handling unless we
    // need to, e.g., for doing caseless matching or case folding.
    let bytes = self.as_bytes();
    // The output is usually about as long as the input, so reserve that
    // much up front.
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        match ch {
            // A replacement codepoint: copy the source bytes verbatim so
            // that invalid UTF-8 is carried over unchanged.
            '\u{FFFD}' => buf.push_str(&bytes[start..end]),
            // ASCII takes the cheap single-character path.
            c if c.is_ascii() => buf.push_char(c.to_ascii_uppercase()),
            // An uppercase mapping may expand to several codepoints.
            c => {
                for upper in c.to_uppercase() {
                    buf.push_char(upper);
                }
            }
        }
    }
}
2566
2567 /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of
2568 /// this byte string.
2569 ///
2570 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2571 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2572 /// In particular, the length of the byte string returned is always
2573 /// equivalent to the length of this byte string.
2574 ///
2575 /// If you'd like to reuse an allocation for performance reasons, then use
2576 /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform
2577 /// the conversion in place.
2578 ///
2579 /// # Examples
2580 ///
2581 /// Basic usage:
2582 ///
2583 /// ```
2584 /// use bstr::{B, ByteSlice};
2585 ///
2586 /// let s = B("hello β");
2587 /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β"));
2588 /// ```
2589 ///
2590 /// Invalid UTF-8 remains as is:
2591 ///
2592 /// ```
2593 /// use bstr::{B, ByteSlice};
2594 ///
2595 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2596 /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2597 /// ```
#[cfg(feature = "std")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
    // Delegate to the method of the same name on the underlying bytes.
    let bytes = self.as_bytes();
    bytes.to_ascii_uppercase()
}
2603
2604 /// Convert this byte string to its uppercase ASCII equivalent in place.
2605 ///
2606 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2607 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2608 ///
2609 /// If you don't need to do the conversion in
2610 /// place and instead prefer convenience, then use
2611 /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead.
2612 ///
2613 /// # Examples
2614 ///
2615 /// Basic usage:
2616 ///
2617 /// ```
2618 /// use bstr::{B, ByteSlice};
2619 ///
2620 /// let mut s = <Vec<u8>>::from("hello β");
2621 /// s.make_ascii_uppercase();
2622 /// assert_eq!(s, B("HELLO β"));
2623 /// ```
2624 ///
2625 /// Invalid UTF-8 remains as is:
2626 ///
2627 /// ```
2628 /// use bstr::{B, ByteSlice, ByteVec};
2629 ///
2630 /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
2631 /// s.make_ascii_uppercase();
2632 /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2633 /// ```
2634 #[inline]
make_ascii_uppercase(&mut self)2635 fn make_ascii_uppercase(&mut self) {
2636 self.as_bytes_mut().make_ascii_uppercase();
2637 }
2638
2639 /// Reverse the bytes in this string, in place.
2640 ///
2641 /// This is not necessarily a well formed operation! For example, if this
2642 /// byte string contains valid UTF-8 that isn't ASCII, then reversing the
2643 /// string will likely result in invalid UTF-8 and otherwise non-sensical
2644 /// content.
2645 ///
2646 /// Note that this is equivalent to the generic `[u8]::reverse` method.
2647 /// This method is provided to permit callers to explicitly differentiate
2648 /// between reversing bytes, codepoints and graphemes.
2649 ///
2650 /// # Examples
2651 ///
2652 /// Basic usage:
2653 ///
2654 /// ```
2655 /// use bstr::ByteSlice;
2656 ///
2657 /// let mut s = <Vec<u8>>::from("hello");
2658 /// s.reverse_bytes();
2659 /// assert_eq!(s, "olleh".as_bytes());
2660 /// ```
2661 #[inline]
reverse_bytes(&mut self)2662 fn reverse_bytes(&mut self) {
2663 self.as_bytes_mut().reverse();
2664 }
2665
2666 /// Reverse the codepoints in this string, in place.
2667 ///
2668 /// If this byte string is valid UTF-8, then its reversal by codepoint
2669 /// is also guaranteed to be valid UTF-8.
2670 ///
2671 /// This operation is equivalent to the following, but without allocating:
2672 ///
2673 /// ```
2674 /// use bstr::ByteSlice;
2675 ///
2676 /// let mut s = <Vec<u8>>::from("foo☃bar");
2677 ///
2678 /// let mut chars: Vec<char> = s.chars().collect();
2679 /// chars.reverse();
2680 ///
2681 /// let reversed: String = chars.into_iter().collect();
2682 /// assert_eq!(reversed, "rab☃oof");
2683 /// ```
2684 ///
2685 /// Note that this is not necessarily a well formed operation. For example,
2686 /// if this byte string contains grapheme clusters with more than one
2687 /// codepoint, then those grapheme clusters will not necessarily be
2688 /// preserved. If you'd like to preserve grapheme clusters, then use
2689 /// [`reverse_graphemes`](#method.reverse_graphemes) instead.
2690 ///
2691 /// # Examples
2692 ///
2693 /// Basic usage:
2694 ///
2695 /// ```
2696 /// use bstr::ByteSlice;
2697 ///
2698 /// let mut s = <Vec<u8>>::from("foo☃bar");
2699 /// s.reverse_chars();
2700 /// assert_eq!(s, "rab☃oof".as_bytes());
2701 /// ```
2702 ///
2703 /// This example shows that not all reversals lead to a well formed string.
2704 /// For example, in this case, combining marks are used to put accents over
2705 /// some letters, and those accent marks must appear after the codepoints
2706 /// they modify.
2707 ///
2708 /// ```
2709 /// use bstr::{B, ByteSlice};
2710 ///
2711 /// let mut s = <Vec<u8>>::from("résumé");
2712 /// s.reverse_chars();
2713 /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
2714 /// ```
2715 ///
2716 /// A word of warning: the above example relies on the fact that
2717 /// `résumé` is in decomposed normal form, which means there are separate
2718 /// codepoints for the accents above `e`. If it is instead in composed
2719 /// normal form, then the example works:
2720 ///
2721 /// ```
2722 /// use bstr::{B, ByteSlice};
2723 ///
2724 /// let mut s = <Vec<u8>>::from("résumé");
2725 /// s.reverse_chars();
2726 /// assert_eq!(s, B("émusér"));
2727 /// ```
2728 ///
2729 /// The point here is to be cautious and not assume that just because
2730 /// `reverse_chars` works in one case, that it therefore works in all
2731 /// cases.
2732 #[inline]
reverse_chars(&mut self)2733 fn reverse_chars(&mut self) {
2734 let mut i = 0;
2735 loop {
2736 let (_, size) = utf8::decode(&self.as_bytes()[i..]);
2737 if size == 0 {
2738 break;
2739 }
2740 if size > 1 {
2741 self.as_bytes_mut()[i..i + size].reverse_bytes();
2742 }
2743 i += size;
2744 }
2745 self.reverse_bytes();
2746 }
2747
2748 /// Reverse the graphemes in this string, in place.
2749 ///
2750 /// If this byte string is valid UTF-8, then its reversal by grapheme
2751 /// is also guaranteed to be valid UTF-8.
2752 ///
2753 /// This operation is equivalent to the following, but without allocating:
2754 ///
2755 /// ```
2756 /// use bstr::ByteSlice;
2757 ///
2758 /// let mut s = <Vec<u8>>::from("foo☃bar");
2759 ///
2760 /// let mut graphemes: Vec<&str> = s.graphemes().collect();
2761 /// graphemes.reverse();
2762 ///
2763 /// let reversed = graphemes.concat();
2764 /// assert_eq!(reversed, "rab☃oof");
2765 /// ```
2766 ///
2767 /// # Examples
2768 ///
2769 /// Basic usage:
2770 ///
2771 /// ```
2772 /// use bstr::ByteSlice;
2773 ///
2774 /// let mut s = <Vec<u8>>::from("foo☃bar");
2775 /// s.reverse_graphemes();
2776 /// assert_eq!(s, "rab☃oof".as_bytes());
2777 /// ```
2778 ///
2779 /// This example shows how this correctly handles grapheme clusters,
2780 /// unlike `reverse_chars`.
2781 ///
2782 /// ```
2783 /// use bstr::ByteSlice;
2784 ///
2785 /// let mut s = <Vec<u8>>::from("résumé");
2786 /// s.reverse_graphemes();
2787 /// assert_eq!(s, "émusér".as_bytes());
2788 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn reverse_graphemes(&mut self) {
    use unicode::decode_grapheme;

    // Two passes: first flip the bytes of every multi-byte grapheme in
    // place, then flip the whole string. The second pass undoes the first
    // within each grapheme, so graphemes keep their byte order while
    // their relative order is reversed.
    let mut offset = 0;
    loop {
        let width = decode_grapheme(&self.as_bytes()[offset..]).1;
        match width {
            // Nothing left to decode.
            0 => break,
            // A single byte is its own reversal.
            1 => {}
            _ => self.as_bytes_mut()[offset..offset + width].reverse_bytes(),
        }
        offset += width;
    }
    self.reverse_bytes();
}
2807
2808 /// Returns true if and only if every byte in this byte string is ASCII.
2809 ///
2810 /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to
2811 /// an ASCII codepoint if and only if it is in the inclusive range
2812 /// `[0, 127]`.
2813 ///
2814 /// # Examples
2815 ///
2816 /// Basic usage:
2817 ///
2818 /// ```
2819 /// use bstr::{B, ByteSlice};
2820 ///
2821 /// assert!(B("abc").is_ascii());
2822 /// assert!(!B("☃βツ").is_ascii());
2823 /// assert!(!B(b"\xFF").is_ascii());
2824 /// ```
2825 #[inline]
is_ascii(&self) -> bool2826 fn is_ascii(&self) -> bool {
2827 ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len()
2828 }
2829
2830 /// Returns true if and only if the entire byte string is valid UTF-8.
2831 ///
2832 /// If you need location information about where a byte string's first
2833 /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method.
2834 ///
2835 /// # Examples
2836 ///
2837 /// Basic usage:
2838 ///
2839 /// ```
2840 /// use bstr::{B, ByteSlice};
2841 ///
2842 /// assert!(B("abc").is_utf8());
2843 /// assert!(B("☃βツ").is_utf8());
2844 /// // invalid bytes
2845 /// assert!(!B(b"abc\xFF").is_utf8());
2846 /// // surrogate encoding
2847 /// assert!(!B(b"\xED\xA0\x80").is_utf8());
2848 /// // incomplete sequence
2849 /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
2850 /// // overlong sequence
2851 /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
2852 /// ```
2853 #[inline]
is_utf8(&self) -> bool2854 fn is_utf8(&self) -> bool {
2855 utf8::validate(self.as_bytes()).is_ok()
2856 }
2857
2858 /// Returns the last byte in this byte string, if it's non-empty. If this
2859 /// byte string is empty, this returns `None`.
2860 ///
2861 /// Note that this is like the generic `[u8]::last`, except this returns
2862 /// the byte by value instead of a reference to the byte.
2863 ///
2864 /// # Examples
2865 ///
2866 /// Basic usage:
2867 ///
2868 /// ```
2869 /// use bstr::ByteSlice;
2870 ///
2871 /// assert_eq!(Some(b'z'), b"baz".last_byte());
2872 /// assert_eq!(None, b"".last_byte());
2873 /// ```
2874 #[inline]
last_byte(&self) -> Option<u8>2875 fn last_byte(&self) -> Option<u8> {
2876 let bytes = self.as_bytes();
2877 bytes.get(bytes.len().saturating_sub(1)).map(|&b| b)
2878 }
2879
2880 /// Returns the index of the first non-ASCII byte in this byte string (if
2881 /// any such indices exist). Specifically, it returns the index of the
2882 /// first byte with a value greater than or equal to `0x80`.
2883 ///
2884 /// # Examples
2885 ///
2886 /// Basic usage:
2887 ///
2888 /// ```
2889 /// use bstr::{ByteSlice, B};
2890 ///
2891 /// assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte());
2892 /// assert_eq!(None, b"abcde".find_non_ascii_byte());
2893 /// assert_eq!(Some(0), B("").find_non_ascii_byte());
2894 /// ```
2895 #[inline]
find_non_ascii_byte(&self) -> Option<usize>2896 fn find_non_ascii_byte(&self) -> Option<usize> {
2897 let index = ascii::first_non_ascii_byte(self.as_bytes());
2898 if index == self.as_bytes().len() {
2899 None
2900 } else {
2901 Some(index)
2902 }
2903 }
2904
2905 /// Copies elements from one part of the slice to another part of itself,
2906 /// where the parts may be overlapping.
2907 ///
2908 /// `src` is the range within this byte string to copy from, while `dest`
2909 /// is the starting index of the range within this byte string to copy to.
2910 /// The length indicated by `src` must be less than or equal to the number
2911 /// of bytes from `dest` to the end of the byte string.
2912 ///
2913 /// # Panics
2914 ///
2915 /// Panics if either range is out of bounds, or if `src` is too big to fit
2916 /// into `dest`, or if the end of `src` is before the start.
2917 ///
2918 /// # Examples
2919 ///
2920 /// Copying four bytes within a byte string:
2921 ///
2922 /// ```
2923 /// use bstr::{B, ByteSlice};
2924 ///
2925 /// let mut buf = *b"Hello, World!";
2926 /// let s = &mut buf;
2927 /// s.copy_within_str(1..5, 8);
2928 /// assert_eq!(s, B("Hello, Wello!"));
2929 /// ```
2930 #[inline]
copy_within_str<R>(&mut self, src: R, dest: usize) where R: ops::RangeBounds<usize>,2931 fn copy_within_str<R>(&mut self, src: R, dest: usize)
2932 where
2933 R: ops::RangeBounds<usize>,
2934 {
2935 // TODO: Deprecate this once slice::copy_within stabilizes.
2936 let src_start = match src.start_bound() {
2937 ops::Bound::Included(&n) => n,
2938 ops::Bound::Excluded(&n) => {
2939 n.checked_add(1).expect("attempted to index slice beyond max")
2940 }
2941 ops::Bound::Unbounded => 0,
2942 };
2943 let src_end = match src.end_bound() {
2944 ops::Bound::Included(&n) => {
2945 n.checked_add(1).expect("attempted to index slice beyond max")
2946 }
2947 ops::Bound::Excluded(&n) => n,
2948 ops::Bound::Unbounded => self.as_bytes().len(),
2949 };
2950 assert!(src_start <= src_end, "src end is before src start");
2951 assert!(src_end <= self.as_bytes().len(), "src is out of bounds");
2952 let count = src_end - src_start;
2953 assert!(
2954 dest <= self.as_bytes().len() - count,
2955 "dest is out of bounds",
2956 );
2957
2958 // SAFETY: This is safe because we use ptr::copy to handle overlapping
2959 // copies, and is also safe because we've checked all the bounds above.
2960 // Finally, we are only dealing with u8 data, which is Copy, which
2961 // means we can copy without worrying about ownership/destructors.
2962 unsafe {
2963 ptr::copy(
2964 self.as_bytes().get_unchecked(src_start),
2965 self.as_bytes_mut().get_unchecked_mut(dest),
2966 count,
2967 );
2968 }
2969 }
2970 }
2971
2972 /// A single substring searcher fixed to a particular needle.
2973 ///
2974 /// The purpose of this type is to permit callers to construct a substring
2975 /// searcher that can be used to search haystacks without the overhead of
2976 /// constructing the searcher in the first place. This is a somewhat niche
2977 /// concern when it's necessary to re-use the same needle to search multiple
2978 /// different haystacks with as little overhead as possible. In general, using
2979 /// [`ByteSlice::find`](trait.ByteSlice.html#method.find)
2980 /// or
2981 /// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter)
2982 /// is good enough, but `Finder` is useful when you can meaningfully observe
2983 /// searcher construction time in a profile.
2984 ///
2985 /// When the `std` feature is enabled, then this type has an `into_owned`
2986 /// version which permits building a `Finder` that is not connected to the
2987 /// lifetime of its needle.
2988 #[derive(Clone, Debug)]
2989 pub struct Finder<'a> {
2990 searcher: TwoWay<'a>,
2991 }
2992
2993 impl<'a> Finder<'a> {
2994 /// Create a new finder for the given needle.
2995 #[inline]
new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a>2996 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> {
2997 Finder { searcher: TwoWay::forward(needle.as_ref()) }
2998 }
2999
3000 /// Convert this finder into its owned variant, such that it no longer
3001 /// borrows the needle.
3002 ///
3003 /// If this is already an owned finder, then this is a no-op. Otherwise,
3004 /// this copies the needle.
3005 ///
3006 /// This is only available when the `std` feature is enabled.
3007 #[cfg(feature = "std")]
3008 #[inline]
into_owned(self) -> Finder<'static>3009 pub fn into_owned(self) -> Finder<'static> {
3010 Finder { searcher: self.searcher.into_owned() }
3011 }
3012
3013 /// Returns the needle that this finder searches for.
3014 ///
3015 /// Note that the lifetime of the needle returned is tied to the lifetime
3016 /// of the finder, and may be shorter than the `'a` lifetime. Namely, a
3017 /// finder's needle can be either borrowed or owned, so the lifetime of the
3018 /// needle returned must necessarily be the shorter of the two.
3019 #[inline]
needle(&self) -> &[u8]3020 pub fn needle(&self) -> &[u8] {
3021 self.searcher.needle()
3022 }
3023
3024 /// Returns the index of the first occurrence of this needle in the given
3025 /// haystack.
3026 ///
3027 /// The haystack may be any type that can be cheaply converted into a
3028 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3029 ///
3030 /// # Complexity
3031 ///
3032 /// This routine is guaranteed to have worst case linear time complexity
3033 /// with respect to both the needle and the haystack. That is, this runs
3034 /// in `O(needle.len() + haystack.len())` time.
3035 ///
3036 /// This routine is also guaranteed to have worst case constant space
3037 /// complexity.
3038 ///
3039 /// # Examples
3040 ///
3041 /// Basic usage:
3042 ///
3043 /// ```
3044 /// use bstr::Finder;
3045 ///
3046 /// let haystack = "foo bar baz";
3047 /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
3048 /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
3049 /// assert_eq!(None, Finder::new("quux").find(haystack));
3050 /// ```
3051 #[inline]
find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize>3052 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3053 self.searcher.find(haystack.as_ref())
3054 }
3055 }
3056
3057 /// A single substring reverse searcher fixed to a particular needle.
3058 ///
3059 /// The purpose of this type is to permit callers to construct a substring
3060 /// searcher that can be used to search haystacks without the overhead of
3061 /// constructing the searcher in the first place. This is a somewhat niche
3062 /// concern when it's necessary to re-use the same needle to search multiple
3063 /// different haystacks with as little overhead as possible. In general, using
3064 /// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind)
3065 /// or
3066 /// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter)
3067 /// is good enough, but `FinderReverse` is useful when you can meaningfully
3068 /// observe searcher construction time in a profile.
3069 ///
3070 /// When the `std` feature is enabled, then this type has an `into_owned`
3071 /// version which permits building a `FinderReverse` that is not connected to
3072 /// the lifetime of its needle.
3073 #[derive(Clone, Debug)]
3074 pub struct FinderReverse<'a> {
3075 searcher: TwoWay<'a>,
3076 }
3077
3078 impl<'a> FinderReverse<'a> {
3079 /// Create a new reverse finder for the given needle.
3080 #[inline]
new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a>3081 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
3082 FinderReverse { searcher: TwoWay::reverse(needle.as_ref()) }
3083 }
3084
3085 /// Convert this finder into its owned variant, such that it no longer
3086 /// borrows the needle.
3087 ///
3088 /// If this is already an owned finder, then this is a no-op. Otherwise,
3089 /// this copies the needle.
3090 ///
3091 /// This is only available when the `std` feature is enabled.
3092 #[cfg(feature = "std")]
3093 #[inline]
into_owned(self) -> FinderReverse<'static>3094 pub fn into_owned(self) -> FinderReverse<'static> {
3095 FinderReverse { searcher: self.searcher.into_owned() }
3096 }
3097
3098 /// Returns the needle that this finder searches for.
3099 ///
3100 /// Note that the lifetime of the needle returned is tied to the lifetime
3101 /// of this finder, and may be shorter than the `'a` lifetime. Namely,
3102 /// a finder's needle can be either borrowed or owned, so the lifetime of
3103 /// the needle returned must necessarily be the shorter of the two.
3104 #[inline]
needle(&self) -> &[u8]3105 pub fn needle(&self) -> &[u8] {
3106 self.searcher.needle()
3107 }
3108
3109 /// Returns the index of the last occurrence of this needle in the given
3110 /// haystack.
3111 ///
3112 /// The haystack may be any type that can be cheaply converted into a
3113 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3114 ///
3115 /// # Complexity
3116 ///
3117 /// This routine is guaranteed to have worst case linear time complexity
3118 /// with respect to both the needle and the haystack. That is, this runs
3119 /// in `O(needle.len() + haystack.len())` time.
3120 ///
3121 /// This routine is also guaranteed to have worst case constant space
3122 /// complexity.
3123 ///
3124 /// # Examples
3125 ///
3126 /// Basic usage:
3127 ///
3128 /// ```
3129 /// use bstr::FinderReverse;
3130 ///
3131 /// let haystack = "foo bar baz";
3132 /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
3133 /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
3134 /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
3135 /// ```
3136 #[inline]
rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize>3137 pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3138 self.searcher.rfind(haystack.as_ref())
3139 }
3140 }
3141
3142 /// An iterator over non-overlapping substring matches.
3143 ///
3144 /// Matches are reported by the byte offset at which they begin.
3145 ///
3146 /// `'a` is the shorter of two lifetimes: the byte string being searched or the
3147 /// byte string being looked for.
3148 #[derive(Debug)]
3149 pub struct Find<'a> {
3150 haystack: &'a [u8],
3151 prestate: PrefilterState,
3152 searcher: TwoWay<'a>,
3153 pos: usize,
3154 }
3155
3156 impl<'a> Find<'a> {
new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a>3157 fn new(haystack: &'a [u8], needle: &'a [u8]) -> Find<'a> {
3158 let searcher = TwoWay::forward(needle);
3159 let prestate = searcher.prefilter_state();
3160 Find { haystack, prestate, searcher, pos: 0 }
3161 }
3162 }
3163
3164 impl<'a> Iterator for Find<'a> {
3165 type Item = usize;
3166
3167 #[inline]
next(&mut self) -> Option<usize>3168 fn next(&mut self) -> Option<usize> {
3169 if self.pos > self.haystack.len() {
3170 return None;
3171 }
3172 let result = self
3173 .searcher
3174 .find_with(&mut self.prestate, &self.haystack[self.pos..]);
3175 match result {
3176 None => None,
3177 Some(i) => {
3178 let pos = self.pos + i;
3179 self.pos = pos + cmp::max(1, self.searcher.needle().len());
3180 Some(pos)
3181 }
3182 }
3183 }
3184 }
3185
3186 /// An iterator over non-overlapping substring matches in reverse.
3187 ///
3188 /// Matches are reported by the byte offset at which they begin.
3189 ///
3190 /// `'a` is the shorter of two lifetimes: the byte string being searched or the
3191 /// byte string being looked for.
3192 #[derive(Debug)]
3193 pub struct FindReverse<'a> {
3194 haystack: &'a [u8],
3195 prestate: PrefilterState,
3196 searcher: TwoWay<'a>,
3197 /// When searching with an empty needle, this gets set to `None` after
3198 /// we've yielded the last element at `0`.
3199 pos: Option<usize>,
3200 }
3201
3202 impl<'a> FindReverse<'a> {
new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a>3203 fn new(haystack: &'a [u8], needle: &'a [u8]) -> FindReverse<'a> {
3204 let searcher = TwoWay::reverse(needle);
3205 let prestate = searcher.prefilter_state();
3206 let pos = Some(haystack.len());
3207 FindReverse { haystack, prestate, searcher, pos }
3208 }
3209
haystack(&self) -> &'a [u8]3210 fn haystack(&self) -> &'a [u8] {
3211 self.haystack
3212 }
3213
needle(&self) -> &[u8]3214 fn needle(&self) -> &[u8] {
3215 self.searcher.needle()
3216 }
3217 }
3218
3219 impl<'a> Iterator for FindReverse<'a> {
3220 type Item = usize;
3221
3222 #[inline]
next(&mut self) -> Option<usize>3223 fn next(&mut self) -> Option<usize> {
3224 let pos = match self.pos {
3225 None => return None,
3226 Some(pos) => pos,
3227 };
3228 let result = self
3229 .searcher
3230 .rfind_with(&mut self.prestate, &self.haystack[..pos]);
3231 match result {
3232 None => None,
3233 Some(i) => {
3234 if pos == i {
3235 self.pos = pos.checked_sub(1);
3236 } else {
3237 self.pos = Some(i);
3238 }
3239 Some(i)
3240 }
3241 }
3242 }
3243 }
3244
/// An iterator that yields the bytes of a byte string by value.
///
/// `'a` is the lifetime of the byte string being traversed.
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    it: slice::Iter<'a, u8>,
}

impl<'a> Bytes<'a> {
    /// Returns the not-yet-consumed portion of the underlying data as a
    /// subslice of the original data.
    ///
    /// The returned slice has the same lifetime as the original slice, so
    /// the iterator can continue to be used while it exists.
    #[inline]
    pub fn as_slice(&self) -> &'a [u8] {
        let remaining = &self.it;
        remaining.as_slice()
    }
}

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.it.next().cloned()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let inner = &self.it;
        inner.size_hint()
    }
}

impl<'a> DoubleEndedIterator for Bytes<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<u8> {
        self.it.next_back().cloned()
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    #[inline]
    fn len(&self) -> usize {
        let inner = &self.it;
        inner.len()
    }
}

impl<'a> iter::FusedIterator for Bytes<'a> {}
3292
3293 /// An iterator over the fields in a byte string, separated by whitespace.
3294 ///
3295 /// This iterator splits on contiguous runs of whitespace, such that the fields
3296 /// in `foo\t\t\n \nbar` are `foo` and `bar`.
3297 ///
3298 /// `'a` is the lifetime of the byte string being split.
3299 #[derive(Debug)]
3300 pub struct Fields<'a> {
3301 it: FieldsWith<'a, fn(char) -> bool>,
3302 }
3303
3304 impl<'a> Fields<'a> {
new(bytes: &'a [u8]) -> Fields<'a>3305 fn new(bytes: &'a [u8]) -> Fields<'a> {
3306 Fields { it: bytes.fields_with(|ch| ch.is_whitespace()) }
3307 }
3308 }
3309
3310 impl<'a> Iterator for Fields<'a> {
3311 type Item = &'a [u8];
3312
3313 #[inline]
next(&mut self) -> Option<&'a [u8]>3314 fn next(&mut self) -> Option<&'a [u8]> {
3315 self.it.next()
3316 }
3317 }
3318
3319 /// An iterator over fields in the byte string, separated by a predicate over
3320 /// codepoints.
3321 ///
3322 /// This iterator splits a byte string based on its predicate function such
3323 /// that the elements returned are separated by contiguous runs of codepoints
3324 /// for which the predicate returns true.
3325 ///
3326 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3327 /// of the predicate, i.e., `FnMut(char) -> bool`.
3328 #[derive(Debug)]
3329 pub struct FieldsWith<'a, F> {
3330 f: F,
3331 bytes: &'a [u8],
3332 chars: CharIndices<'a>,
3333 }
3334
3335 impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> {
new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F>3336 fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> {
3337 FieldsWith { f, bytes, chars: bytes.char_indices() }
3338 }
3339 }
3340
3341 impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
3342 type Item = &'a [u8];
3343
3344 #[inline]
next(&mut self) -> Option<&'a [u8]>3345 fn next(&mut self) -> Option<&'a [u8]> {
3346 let (start, mut end);
3347 loop {
3348 match self.chars.next() {
3349 None => return None,
3350 Some((s, e, ch)) => {
3351 if !(self.f)(ch) {
3352 start = s;
3353 end = e;
3354 break;
3355 }
3356 }
3357 }
3358 }
3359 while let Some((_, e, ch)) = self.chars.next() {
3360 if (self.f)(ch) {
3361 break;
3362 }
3363 end = e;
3364 }
3365 Some(&self.bytes[start..end])
3366 }
3367 }
3368
3369 /// An iterator over substrings in a byte string, split by a separator.
3370 ///
3371 /// `'a` is the lifetime of the byte string being split.
3372 #[derive(Debug)]
3373 pub struct Split<'a> {
3374 finder: Find<'a>,
3375 /// The end position of the previous match of our splitter. The element
3376 /// we yield corresponds to the substring starting at `last` up to the
3377 /// beginning of the next match of the splitter.
3378 last: usize,
3379 /// Only set when iteration is complete. A corner case here is when a
3380 /// splitter is matched at the end of the haystack. At that point, we still
3381 /// need to yield an empty string following it.
3382 done: bool,
3383 }
3384
3385 impl<'a> Split<'a> {
new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a>3386 fn new(haystack: &'a [u8], splitter: &'a [u8]) -> Split<'a> {
3387 let finder = haystack.find_iter(splitter);
3388 Split { finder, last: 0, done: false }
3389 }
3390 }
3391
3392 impl<'a> Iterator for Split<'a> {
3393 type Item = &'a [u8];
3394
3395 #[inline]
next(&mut self) -> Option<&'a [u8]>3396 fn next(&mut self) -> Option<&'a [u8]> {
3397 let haystack = self.finder.haystack;
3398 match self.finder.next() {
3399 Some(start) => {
3400 let next = &haystack[self.last..start];
3401 self.last = start + self.finder.searcher.needle().len();
3402 Some(next)
3403 }
3404 None => {
3405 if self.last >= haystack.len() {
3406 if !self.done {
3407 self.done = true;
3408 Some(b"")
3409 } else {
3410 None
3411 }
3412 } else {
3413 let s = &haystack[self.last..];
3414 self.last = haystack.len();
3415 self.done = true;
3416 Some(s)
3417 }
3418 }
3419 }
3420 }
3421 }
3422
3423 /// An iterator over substrings in a byte string, split by a separator, in
3424 /// reverse.
3425 ///
3426 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3427 /// of the predicate, i.e., `FnMut(char) -> bool`.
3428 #[derive(Debug)]
3429 pub struct SplitReverse<'a> {
3430 finder: FindReverse<'a>,
3431 /// The end position of the previous match of our splitter. The element
3432 /// we yield corresponds to the substring starting at `last` up to the
3433 /// beginning of the next match of the splitter.
3434 last: usize,
3435 /// Only set when iteration is complete. A corner case here is when a
3436 /// splitter is matched at the end of the haystack. At that point, we still
3437 /// need to yield an empty string following it.
3438 done: bool,
3439 }
3440
3441 impl<'a> SplitReverse<'a> {
new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a>3442 fn new(haystack: &'a [u8], splitter: &'a [u8]) -> SplitReverse<'a> {
3443 let finder = haystack.rfind_iter(splitter);
3444 SplitReverse { finder, last: haystack.len(), done: false }
3445 }
3446 }
3447
3448 impl<'a> Iterator for SplitReverse<'a> {
3449 type Item = &'a [u8];
3450
3451 #[inline]
next(&mut self) -> Option<&'a [u8]>3452 fn next(&mut self) -> Option<&'a [u8]> {
3453 let haystack = self.finder.haystack();
3454 match self.finder.next() {
3455 Some(start) => {
3456 let nlen = self.finder.needle().len();
3457 let next = &haystack[start + nlen..self.last];
3458 self.last = start;
3459 Some(next)
3460 }
3461 None => {
3462 if self.last == 0 {
3463 if !self.done {
3464 self.done = true;
3465 Some(b"")
3466 } else {
3467 None
3468 }
3469 } else {
3470 let s = &haystack[..self.last];
3471 self.last = 0;
3472 self.done = true;
3473 Some(s)
3474 }
3475 }
3476 }
3477 }
3478 }
3479
3480 /// An iterator over at most `n` substrings in a byte string, split by a
3481 /// separator.
3482 ///
3483 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3484 /// of the predicate, i.e., `FnMut(char) -> bool`.
3485 #[derive(Debug)]
3486 pub struct SplitN<'a> {
3487 split: Split<'a>,
3488 limit: usize,
3489 count: usize,
3490 }
3491
3492 impl<'a> SplitN<'a> {
new( haystack: &'a [u8], splitter: &'a [u8], limit: usize, ) -> SplitN<'a>3493 fn new(
3494 haystack: &'a [u8],
3495 splitter: &'a [u8],
3496 limit: usize,
3497 ) -> SplitN<'a> {
3498 let split = haystack.split_str(splitter);
3499 SplitN { split, limit, count: 0 }
3500 }
3501 }
3502
3503 impl<'a> Iterator for SplitN<'a> {
3504 type Item = &'a [u8];
3505
3506 #[inline]
next(&mut self) -> Option<&'a [u8]>3507 fn next(&mut self) -> Option<&'a [u8]> {
3508 self.count += 1;
3509 if self.count > self.limit || self.split.done {
3510 None
3511 } else if self.count == self.limit {
3512 Some(&self.split.finder.haystack[self.split.last..])
3513 } else {
3514 self.split.next()
3515 }
3516 }
3517 }
3518
3519 /// An iterator over at most `n` substrings in a byte string, split by a
3520 /// separator, in reverse.
3521 ///
3522 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3523 /// of the predicate, i.e., `FnMut(char) -> bool`.
3524 #[derive(Debug)]
3525 pub struct SplitNReverse<'a> {
3526 split: SplitReverse<'a>,
3527 limit: usize,
3528 count: usize,
3529 }
3530
3531 impl<'a> SplitNReverse<'a> {
new( haystack: &'a [u8], splitter: &'a [u8], limit: usize, ) -> SplitNReverse<'a>3532 fn new(
3533 haystack: &'a [u8],
3534 splitter: &'a [u8],
3535 limit: usize,
3536 ) -> SplitNReverse<'a> {
3537 let split = haystack.rsplit_str(splitter);
3538 SplitNReverse { split, limit, count: 0 }
3539 }
3540 }
3541
3542 impl<'a> Iterator for SplitNReverse<'a> {
3543 type Item = &'a [u8];
3544
3545 #[inline]
next(&mut self) -> Option<&'a [u8]>3546 fn next(&mut self) -> Option<&'a [u8]> {
3547 self.count += 1;
3548 if self.count > self.limit || self.split.done {
3549 None
3550 } else if self.count == self.limit {
3551 Some(&self.split.finder.haystack()[..self.split.last])
3552 } else {
3553 self.split.next()
3554 }
3555 }
3556 }
3557
3558 /// An iterator over all lines in a byte string, without their terminators.
3559 ///
3560 /// For this iterator, the only line terminators recognized are `\r\n` and
3561 /// `\n`.
3562 ///
3563 /// `'a` is the lifetime of the byte string being iterated over.
3564 pub struct Lines<'a> {
3565 it: LinesWithTerminator<'a>,
3566 }
3567
3568 impl<'a> Lines<'a> {
new(bytes: &'a [u8]) -> Lines<'a>3569 fn new(bytes: &'a [u8]) -> Lines<'a> {
3570 Lines { it: LinesWithTerminator::new(bytes) }
3571 }
3572 }
3573
3574 impl<'a> Iterator for Lines<'a> {
3575 type Item = &'a [u8];
3576
3577 #[inline]
next(&mut self) -> Option<&'a [u8]>3578 fn next(&mut self) -> Option<&'a [u8]> {
3579 let mut line = self.it.next()?;
3580 if line.last_byte() == Some(b'\n') {
3581 line = &line[..line.len() - 1];
3582 if line.last_byte() == Some(b'\r') {
3583 line = &line[..line.len() - 1];
3584 }
3585 }
3586 Some(line)
3587 }
3588 }
3589
3590 /// An iterator over all lines in a byte string, including their terminators.
3591 ///
3592 /// For this iterator, the only line terminator recognized is `\n`. (Since
3593 /// line terminators are included, this also handles `\r\n` line endings.)
3594 ///
3595 /// Line terminators are only included if they are present in the original
3596 /// byte string. For example, the last line in a byte string may not end with
3597 /// a line terminator.
3598 ///
3599 /// Concatenating all elements yielded by this iterator is guaranteed to yield
3600 /// the original byte string.
3601 ///
3602 /// `'a` is the lifetime of the byte string being iterated over.
3603 pub struct LinesWithTerminator<'a> {
3604 bytes: &'a [u8],
3605 }
3606
3607 impl<'a> LinesWithTerminator<'a> {
new(bytes: &'a [u8]) -> LinesWithTerminator<'a>3608 fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
3609 LinesWithTerminator { bytes }
3610 }
3611 }
3612
3613 impl<'a> Iterator for LinesWithTerminator<'a> {
3614 type Item = &'a [u8];
3615
3616 #[inline]
next(&mut self) -> Option<&'a [u8]>3617 fn next(&mut self) -> Option<&'a [u8]> {
3618 match self.bytes.find_byte(b'\n') {
3619 None if self.bytes.is_empty() => None,
3620 None => {
3621 let line = self.bytes;
3622 self.bytes = b"";
3623 Some(line)
3624 }
3625 Some(end) => {
3626 let line = &self.bytes[..end + 1];
3627 self.bytes = &self.bytes[end + 1..];
3628 Some(line)
3629 }
3630 }
3631 }
3632 }
3633
#[cfg(test)]
mod tests {
    use ext_slice::{ByteSlice, B};
    use tests::LOSSY_TESTS;

    /// Runs every entry of the shared lossy-conversion table through both
    /// lossy conversion routines and through std's `from_utf8_lossy`.
    #[test]
    fn to_str_lossy() {
        for (index, &(want, raw)) in LOSSY_TESTS.iter().enumerate() {
            let want = want.as_bytes();

            let borrowed = B(raw).to_str_lossy();
            assert_eq!(
                want,
                borrowed.as_bytes(),
                "to_str_lossy(ith: {:?}, given: {:?})",
                index,
                raw,
            );

            let mut buffer = String::new();
            B(raw).to_str_lossy_into(&mut buffer);
            assert_eq!(want, buffer.as_bytes(), "to_str_lossy_into");

            let from_std = String::from_utf8_lossy(raw);
            assert_eq!(want, from_std.as_bytes(), "std");
        }
    }

    /// Two bytes do not fit at offset 5 of a six byte buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail1() {
        let mut data = *b"foobar";
        let view = &mut data;
        view.copy_within_str(0..2, 5);
    }

    /// The source range ends before it starts.
    #[test]
    #[should_panic]
    fn copy_within_fail2() {
        let mut data = *b"foobar";
        let view = &mut data;
        view.copy_within_str(3..2, 0);
    }

    /// The source range extends past the end of the buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail3() {
        let mut data = *b"foobar";
        let view = &mut data;
        view.copy_within_str(5..7, 0);
    }

    /// One byte does not fit at offset 6 of a six byte buffer.
    #[test]
    #[should_panic]
    fn copy_within_fail4() {
        let mut data = *b"foobar";
        let view = &mut data;
        view.copy_within_str(0..1, 6);
    }
}
3696