1 use core::{iter, slice, str};
2
3 #[cfg(all(feature = "alloc", feature = "unicode"))]
4 use alloc::vec;
5 #[cfg(feature = "alloc")]
6 use alloc::{borrow::Cow, string::String, vec::Vec};
7
8 #[cfg(feature = "std")]
9 use std::{ffi::OsStr, path::Path};
10
11 use memchr::{memchr, memmem, memrchr};
12
13 #[cfg(feature = "alloc")]
14 use crate::ext_vec::ByteVec;
15 #[cfg(feature = "unicode")]
16 use crate::unicode::{
17 whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes,
18 SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices,
19 WordsWithBreaks,
20 };
21 use crate::{
22 ascii,
23 bstr::BStr,
24 byteset,
25 utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error},
26 };
27
28 /// A short-hand constructor for building a `&[u8]`.
29 ///
30 /// This idiosyncratic constructor is useful for concisely building byte string
31 /// slices. Its primary utility is in conveniently writing byte string literals
32 /// in a uniform way. For example, consider this code that does not compile:
33 ///
34 /// ```ignore
35 /// let strs = vec![b"a", b"xy"];
36 /// ```
37 ///
38 /// The above code doesn't compile because the type of the byte string literal
39 /// `b"a"` is `&'static [u8; 1]`, and the type of `b"xy"` is
40 /// `&'static [u8; 2]`. Since their types aren't the same, they can't be stored
41 /// in the same `Vec`. (This is dissimilar from normal Unicode string slices,
42 /// where both `"a"` and `"xy"` have the same type of `&'static str`.)
43 ///
44 /// One way of getting the above code to compile is to convert byte strings to
45 /// slices. You might try this:
46 ///
47 /// ```ignore
48 /// let strs = vec![&b"a", &b"xy"];
49 /// ```
50 ///
51 /// But this just creates values with type `& &'static [u8; 1]` and
52 /// `& &'static [u8; 2]`. Instead, you need to force the issue like so:
53 ///
54 /// ```
55 /// let strs = vec![&b"a"[..], &b"xy"[..]];
56 /// // or
57 /// let strs = vec![b"a".as_ref(), b"xy".as_ref()];
58 /// ```
59 ///
60 /// But neither of these are particularly convenient to type, especially when
61 /// it's something as common as a string literal. Thus, this constructor
62 /// permits writing the following instead:
63 ///
64 /// ```
65 /// use bstr::B;
66 ///
67 /// let strs = vec![B("a"), B(b"xy")];
68 /// ```
69 ///
70 /// Notice that this also lets you mix and match both string literals and byte
71 /// string literals. This can be quite convenient!
72 #[allow(non_snake_case)]
73 #[inline]
pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] {
    // Spell out the trait call so it is obvious that the only work done
    // here is the `AsRef<[u8]>` conversion of the borrowed input.
    <B as AsRef<[u8]>>::as_ref(bytes)
}
77
78 impl ByteSlice for [u8] {
79 #[inline]
as_bytes(&self) -> &[u8]80 fn as_bytes(&self) -> &[u8] {
81 self
82 }
83
84 #[inline]
as_bytes_mut(&mut self) -> &mut [u8]85 fn as_bytes_mut(&mut self) -> &mut [u8] {
86 self
87 }
88 }
89
90 impl<const N: usize> ByteSlice for [u8; N] {
91 #[inline]
as_bytes(&self) -> &[u8]92 fn as_bytes(&self) -> &[u8] {
93 self
94 }
95
96 #[inline]
as_bytes_mut(&mut self) -> &mut [u8]97 fn as_bytes_mut(&mut self) -> &mut [u8] {
98 self
99 }
100 }
101
/// Ensure that callers cannot implement `ByteSlice` by making an
/// unimplementable trait its super trait.
mod private {
    /// Marker trait that is only nameable through this non-public module.
    pub trait Sealed {}
}
// The byte containers that receive the marker: fixed-size arrays and slices.
impl<const N: usize> private::Sealed for [u8; N] {}
impl private::Sealed for [u8] {}
109
110 /// A trait that extends `&[u8]` with string oriented methods.
111 ///
112 /// This trait is sealed and cannot be implemented outside of `bstr`.
113 pub trait ByteSlice: private::Sealed {
114 /// A method for accessing the raw bytes of this type. This is always a
115 /// no-op and callers shouldn't care about it. This only exists for making
116 /// the extension trait work.
117 #[doc(hidden)]
as_bytes(&self) -> &[u8]118 fn as_bytes(&self) -> &[u8];
119
120 /// A method for accessing the raw bytes of this type, mutably. This is
121 /// always a no-op and callers shouldn't care about it. This only exists
122 /// for making the extension trait work.
123 #[doc(hidden)]
as_bytes_mut(&mut self) -> &mut [u8]124 fn as_bytes_mut(&mut self) -> &mut [u8];
125
126 /// Return this byte slice as a `&BStr`.
127 ///
/// Using `&BStr` is useful because of its `fmt::Debug` representation
129 /// and various other trait implementations (such as `PartialEq` and
130 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
131 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
132 /// sequences are used.
133 ///
134 /// # Examples
135 ///
136 /// Basic usage:
137 ///
138 /// ```
139 /// use bstr::ByteSlice;
140 ///
141 /// println!("{:?}", b"foo\xFFbar".as_bstr());
142 /// ```
143 #[inline]
as_bstr(&self) -> &BStr144 fn as_bstr(&self) -> &BStr {
145 BStr::new(self.as_bytes())
146 }
147
148 /// Return this byte slice as a `&mut BStr`.
149 ///
/// Using `&mut BStr` is useful because of its `fmt::Debug` representation
151 /// and various other trait implementations (such as `PartialEq` and
152 /// `PartialOrd`). In particular, the `Debug` implementation for `BStr`
153 /// shows its bytes as a normal string. For invalid UTF-8, hex escape
154 /// sequences are used.
155 ///
156 /// # Examples
157 ///
158 /// Basic usage:
159 ///
160 /// ```
161 /// use bstr::ByteSlice;
162 ///
163 /// let mut bytes = *b"foo\xFFbar";
164 /// println!("{:?}", &mut bytes.as_bstr_mut());
165 /// ```
166 #[inline]
as_bstr_mut(&mut self) -> &mut BStr167 fn as_bstr_mut(&mut self) -> &mut BStr {
168 BStr::new_mut(self.as_bytes_mut())
169 }
170
171 /// Create an immutable byte string from an OS string slice.
172 ///
173 /// When the underlying bytes of OS strings are accessible, then this
174 /// always succeeds and is zero cost. Otherwise, this returns `None` if the
175 /// given OS string is not valid UTF-8. (For example, when the underlying
176 /// bytes are inaccessible on Windows, file paths are allowed to be a
177 /// sequence of arbitrary 16-bit integers. Not all such sequences can be
178 /// transcoded to valid UTF-8.)
179 ///
180 /// # Examples
181 ///
182 /// Basic usage:
183 ///
184 /// ```
185 /// use std::ffi::OsStr;
186 ///
187 /// use bstr::{B, ByteSlice};
188 ///
189 /// let os_str = OsStr::new("foo");
190 /// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8");
191 /// assert_eq!(bs, B("foo"));
192 /// ```
193 #[cfg(feature = "std")]
194 #[inline]
fn from_os_str(os_str: &OsStr) -> Option<&[u8]> {
    // Exactly one of the two cfg-gated blocks below survives compilation
    // and becomes the value of this function.

    // Unix: the bytes of an `OsStr` are reachable through `OsStrExt`, so
    // the conversion always succeeds.
    #[cfg(unix)]
    {
        use std::os::unix::ffi::OsStrExt;

        Some(os_str.as_bytes())
    }

    // Everywhere else: only possible when `OsStr::to_str` succeeds.
    #[cfg(not(unix))]
    {
        os_str.to_str().map(|text| text.as_bytes())
    }
}
212
213 /// Create an immutable byte string from a file path.
214 ///
215 /// When the underlying bytes of paths are accessible, then this always
216 /// succeeds and is zero cost. Otherwise, this returns `None` if the given
217 /// path is not valid UTF-8. (For example, when the underlying bytes are
218 /// inaccessible on Windows, file paths are allowed to be a sequence of
219 /// arbitrary 16-bit integers. Not all such sequences can be transcoded to
220 /// valid UTF-8.)
221 ///
222 /// # Examples
223 ///
224 /// Basic usage:
225 ///
226 /// ```
227 /// use std::path::Path;
228 ///
229 /// use bstr::{B, ByteSlice};
230 ///
231 /// let path = Path::new("foo");
232 /// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8");
233 /// assert_eq!(bs, B("foo"));
234 /// ```
235 #[cfg(feature = "std")]
236 #[inline]
from_path(path: &Path) -> Option<&[u8]>237 fn from_path(path: &Path) -> Option<&[u8]> {
238 Self::from_os_str(path.as_os_str())
239 }
240
241 /// Safely convert this byte string into a `&str` if it's valid UTF-8.
242 ///
243 /// If this byte string is not valid UTF-8, then an error is returned. The
244 /// error returned indicates the first invalid byte found and the length
245 /// of the error.
246 ///
247 /// In cases where a lossy conversion to `&str` is acceptable, then use one
248 /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or
249 /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into)
250 /// methods.
251 ///
252 /// # Examples
253 ///
254 /// Basic usage:
255 ///
256 /// ```
257 /// # #[cfg(feature = "alloc")] {
258 /// use bstr::{B, ByteSlice, ByteVec};
259 ///
260 /// # fn example() -> Result<(), bstr::Utf8Error> {
261 /// let s = B("☃βツ").to_str()?;
262 /// assert_eq!("☃βツ", s);
263 ///
264 /// let mut bstring = <Vec<u8>>::from("☃βツ");
265 /// bstring.push(b'\xFF');
266 /// let err = bstring.to_str().unwrap_err();
267 /// assert_eq!(8, err.valid_up_to());
268 /// # Ok(()) }; example().unwrap()
269 /// # }
270 /// ```
271 #[inline]
to_str(&self) -> Result<&str, Utf8Error>272 fn to_str(&self) -> Result<&str, Utf8Error> {
273 utf8::validate(self.as_bytes()).map(|_| {
274 // SAFETY: This is safe because of the guarantees provided by
275 // utf8::validate.
276 unsafe { str::from_utf8_unchecked(self.as_bytes()) }
277 })
278 }
279
280 /// Unsafely convert this byte string into a `&str`, without checking for
281 /// valid UTF-8.
282 ///
283 /// # Safety
284 ///
285 /// Callers *must* ensure that this byte string is valid UTF-8 before
286 /// calling this method. Converting a byte string into a `&str` that is
287 /// not valid UTF-8 is considered undefined behavior.
288 ///
289 /// This routine is useful in performance sensitive contexts where the
290 /// UTF-8 validity of the byte string is already known and it is
291 /// undesirable to pay the cost of an additional UTF-8 validation check
292 /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs.
293 ///
294 /// # Examples
295 ///
296 /// Basic usage:
297 ///
298 /// ```
299 /// use bstr::{B, ByteSlice};
300 ///
301 /// // SAFETY: This is safe because string literals are guaranteed to be
302 /// // valid UTF-8 by the Rust compiler.
303 /// let s = unsafe { B("☃βツ").to_str_unchecked() };
304 /// assert_eq!("☃βツ", s);
305 /// ```
306 #[inline]
to_str_unchecked(&self) -> &str307 unsafe fn to_str_unchecked(&self) -> &str {
308 str::from_utf8_unchecked(self.as_bytes())
309 }
310
311 /// Convert this byte string to a valid UTF-8 string by replacing invalid
312 /// UTF-8 bytes with the Unicode replacement codepoint (`U+FFFD`).
313 ///
314 /// If the byte string is already valid UTF-8, then no copying or
/// allocation is performed and a borrowed string slice is returned. If
316 /// the byte string is not valid UTF-8, then an owned string buffer is
317 /// returned with invalid bytes replaced by the replacement codepoint.
318 ///
319 /// This method uses the "substitution of maximal subparts" (Unicode
320 /// Standard, Chapter 3, Section 9) strategy for inserting the replacement
321 /// codepoint. Specifically, a replacement codepoint is inserted whenever a
322 /// byte is found that cannot possibly lead to a valid code unit sequence.
323 /// If there were previous bytes that represented a prefix of a well-formed
324 /// code unit sequence, then all of those bytes are substituted with a
325 /// single replacement codepoint. The "substitution of maximal subparts"
326 /// strategy is the same strategy used by
327 /// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
328 /// For a more precise description of the maximal subpart strategy, see
329 /// the Unicode Standard, Chapter 3, Section 9. See also
330 /// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html).
331 ///
332 /// N.B. Rust's standard library also appears to use the same strategy,
333 /// but it does not appear to be an API guarantee.
334 ///
335 /// # Examples
336 ///
337 /// Basic usage:
338 ///
339 /// ```
340 /// use std::borrow::Cow;
341 ///
342 /// use bstr::ByteSlice;
343 ///
344 /// let mut bstring = <Vec<u8>>::from("☃βツ");
345 /// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy());
346 ///
347 /// // Add a byte that makes the sequence invalid.
348 /// bstring.push(b'\xFF');
349 /// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy());
350 /// ```
351 ///
352 /// This demonstrates the "maximal subpart" substitution logic.
353 ///
354 /// ```
355 /// use bstr::{B, ByteSlice};
356 ///
357 /// // \x61 is the ASCII codepoint for 'a'.
358 /// // \xF1\x80\x80 is a valid 3-byte code unit prefix.
359 /// // \xE1\x80 is a valid 2-byte code unit prefix.
360 /// // \xC2 is a valid 1-byte code unit prefix.
361 /// // \x62 is the ASCII codepoint for 'b'.
362 /// //
363 /// // In sum, each of the prefixes is replaced by a single replacement
364 /// // codepoint since none of the prefixes are properly completed. This
365 /// // is in contrast to other strategies that might insert a replacement
366 /// // codepoint for every single byte.
367 /// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
368 /// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
369 /// ```
370 #[cfg(feature = "alloc")]
371 #[inline]
to_str_lossy(&self) -> Cow<'_, str>372 fn to_str_lossy(&self) -> Cow<'_, str> {
373 match utf8::validate(self.as_bytes()) {
374 Ok(()) => {
375 // SAFETY: This is safe because of the guarantees provided by
376 // utf8::validate.
377 unsafe {
378 Cow::Borrowed(str::from_utf8_unchecked(self.as_bytes()))
379 }
380 }
381 Err(err) => {
382 let mut lossy = String::with_capacity(self.as_bytes().len());
383 let (valid, after) =
384 self.as_bytes().split_at(err.valid_up_to());
385 // SAFETY: This is safe because utf8::validate guarantees
386 // that all of `valid` is valid UTF-8.
387 lossy.push_str(unsafe { str::from_utf8_unchecked(valid) });
388 lossy.push_str("\u{FFFD}");
389 if let Some(len) = err.error_len() {
390 after[len..].to_str_lossy_into(&mut lossy);
391 }
392 Cow::Owned(lossy)
393 }
394 }
395 }
396
397 /// Copy the contents of this byte string into the given owned string
398 /// buffer, while replacing invalid UTF-8 code unit sequences with the
399 /// Unicode replacement codepoint (`U+FFFD`).
400 ///
401 /// This method uses the same "substitution of maximal subparts" strategy
402 /// for inserting the replacement codepoint as the
403 /// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) method.
404 ///
405 /// This routine is useful for amortizing allocation. However, unlike
406 /// `to_str_lossy`, this routine will _always_ copy the contents of this
407 /// byte string into the destination buffer, even if this byte string is
408 /// valid UTF-8.
409 ///
410 /// # Examples
411 ///
412 /// Basic usage:
413 ///
414 /// ```
415 /// use std::borrow::Cow;
416 ///
417 /// use bstr::ByteSlice;
418 ///
419 /// let mut bstring = <Vec<u8>>::from("☃βツ");
420 /// // Add a byte that makes the sequence invalid.
421 /// bstring.push(b'\xFF');
422 ///
423 /// let mut dest = String::new();
424 /// bstring.to_str_lossy_into(&mut dest);
425 /// assert_eq!("☃βツ\u{FFFD}", dest);
426 /// ```
427 #[cfg(feature = "alloc")]
428 #[inline]
to_str_lossy_into(&self, dest: &mut String)429 fn to_str_lossy_into(&self, dest: &mut String) {
430 let mut bytes = self.as_bytes();
431 dest.reserve(bytes.len());
432 loop {
433 match utf8::validate(bytes) {
434 Ok(()) => {
435 // SAFETY: This is safe because utf8::validate guarantees
436 // that all of `bytes` is valid UTF-8.
437 dest.push_str(unsafe { str::from_utf8_unchecked(bytes) });
438 break;
439 }
440 Err(err) => {
441 let (valid, after) = bytes.split_at(err.valid_up_to());
442 // SAFETY: This is safe because utf8::validate guarantees
443 // that all of `valid` is valid UTF-8.
444 dest.push_str(unsafe { str::from_utf8_unchecked(valid) });
445 dest.push_str("\u{FFFD}");
446 match err.error_len() {
447 None => break,
448 Some(len) => bytes = &after[len..],
449 }
450 }
451 }
452 }
453 }
454
455 /// Create an OS string slice from this byte string.
456 ///
457 /// When OS strings can be constructed from arbitrary byte sequences, this
458 /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
459 /// decoding error if this byte string is not valid UTF-8. (For example,
460 /// assuming the representation of `OsStr` is opaque on Windows, file paths
461 /// are allowed to be a sequence of arbitrary 16-bit integers. There is
462 /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
463 /// arbitrary sequence of 16-bit integers. If the representation of `OsStr`
/// is ever opened up, then this will convert any sequence of bytes to an
465 /// `OsStr` without cost.)
466 ///
467 /// # Examples
468 ///
469 /// Basic usage:
470 ///
471 /// ```
472 /// use bstr::{B, ByteSlice};
473 ///
474 /// let os_str = b"foo".to_os_str().expect("should be valid UTF-8");
475 /// assert_eq!(os_str, "foo");
476 /// ```
477 #[cfg(feature = "std")]
478 #[inline]
to_os_str(&self) -> Result<&OsStr, Utf8Error>479 fn to_os_str(&self) -> Result<&OsStr, Utf8Error> {
480 #[cfg(unix)]
481 #[inline]
482 fn imp(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
483 use std::os::unix::ffi::OsStrExt;
484
485 Ok(OsStr::from_bytes(bytes))
486 }
487
488 #[cfg(not(unix))]
489 #[inline]
490 fn imp(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
491 bytes.to_str().map(OsStr::new)
492 }
493
494 imp(self.as_bytes())
495 }
496
497 /// Lossily create an OS string slice from this byte string.
498 ///
499 /// When OS strings can be constructed from arbitrary byte sequences, this
500 /// is zero cost and always returns a slice. Otherwise, this will perform a
501 /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
502 /// the Unicode replacement codepoint.
503 ///
504 /// Note that this can prevent the correct roundtripping of file paths when
505 /// the representation of `OsStr` is opaque.
506 ///
507 /// # Examples
508 ///
509 /// Basic usage:
510 ///
511 /// ```
512 /// use bstr::ByteSlice;
513 ///
514 /// let os_str = b"foo\xFFbar".to_os_str_lossy();
515 /// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
516 /// ```
517 #[cfg(feature = "std")]
518 #[inline]
to_os_str_lossy(&self) -> Cow<'_, OsStr>519 fn to_os_str_lossy(&self) -> Cow<'_, OsStr> {
520 #[cfg(unix)]
521 #[inline]
522 fn imp(bytes: &[u8]) -> Cow<'_, OsStr> {
523 use std::os::unix::ffi::OsStrExt;
524
525 Cow::Borrowed(OsStr::from_bytes(bytes))
526 }
527
528 #[cfg(not(unix))]
529 #[inline]
530 fn imp(bytes: &[u8]) -> Cow<OsStr> {
531 use std::ffi::OsString;
532
533 match bytes.to_str_lossy() {
534 Cow::Borrowed(x) => Cow::Borrowed(OsStr::new(x)),
535 Cow::Owned(x) => Cow::Owned(OsString::from(x)),
536 }
537 }
538
539 imp(self.as_bytes())
540 }
541
542 /// Create a path slice from this byte string.
543 ///
544 /// When paths can be constructed from arbitrary byte sequences, this
545 /// always succeeds and is zero cost. Otherwise, this returns a UTF-8
546 /// decoding error if this byte string is not valid UTF-8. (For example,
547 /// assuming the representation of `Path` is opaque on Windows, file paths
548 /// are allowed to be a sequence of arbitrary 16-bit integers. There is
549 /// no obvious mapping from an arbitrary sequence of 8-bit integers to an
550 /// arbitrary sequence of 16-bit integers. If the representation of `Path`
/// is ever opened up, then this will convert any sequence of bytes to a
/// `Path` without cost.)
553 ///
554 /// # Examples
555 ///
556 /// Basic usage:
557 ///
558 /// ```
559 /// use bstr::ByteSlice;
560 ///
561 /// let path = b"foo".to_path().expect("should be valid UTF-8");
562 /// assert_eq!(path.as_os_str(), "foo");
563 /// ```
564 #[cfg(feature = "std")]
565 #[inline]
to_path(&self) -> Result<&Path, Utf8Error>566 fn to_path(&self) -> Result<&Path, Utf8Error> {
567 self.to_os_str().map(Path::new)
568 }
569
570 /// Lossily create a path slice from this byte string.
571 ///
572 /// When paths can be constructed from arbitrary byte sequences, this is
573 /// zero cost and always returns a slice. Otherwise, this will perform a
574 /// UTF-8 check and lossily convert this byte string into valid UTF-8 using
575 /// the Unicode replacement codepoint.
576 ///
577 /// Note that this can prevent the correct roundtripping of file paths when
578 /// the representation of `Path` is opaque.
579 ///
580 /// # Examples
581 ///
582 /// Basic usage:
583 ///
584 /// ```
585 /// use bstr::ByteSlice;
586 ///
587 /// let bs = b"foo\xFFbar";
588 /// let path = bs.to_path_lossy();
589 /// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
590 /// ```
591 #[cfg(feature = "std")]
592 #[inline]
to_path_lossy(&self) -> Cow<'_, Path>593 fn to_path_lossy(&self) -> Cow<'_, Path> {
594 use std::path::PathBuf;
595
596 match self.to_os_str_lossy() {
597 Cow::Borrowed(x) => Cow::Borrowed(Path::new(x)),
598 Cow::Owned(x) => Cow::Owned(PathBuf::from(x)),
599 }
600 }
601
602 /// Create a new byte string by repeating this byte string `n` times.
603 ///
604 /// # Panics
605 ///
606 /// This function panics if the capacity of the new byte string would
607 /// overflow.
608 ///
609 /// # Examples
610 ///
611 /// Basic usage:
612 ///
613 /// ```
614 /// use bstr::{B, ByteSlice};
615 ///
616 /// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
617 /// assert_eq!(b"foo".repeatn(0), B(""));
618 /// ```
619 #[cfg(feature = "alloc")]
620 #[inline]
repeatn(&self, n: usize) -> Vec<u8>621 fn repeatn(&self, n: usize) -> Vec<u8> {
622 self.as_bytes().repeat(n)
623 }
624
625 /// Returns true if and only if this byte string contains the given needle.
626 ///
627 /// # Examples
628 ///
629 /// Basic usage:
630 ///
631 /// ```
632 /// use bstr::ByteSlice;
633 ///
634 /// assert!(b"foo bar".contains_str("foo"));
635 /// assert!(b"foo bar".contains_str("bar"));
636 /// assert!(!b"foo".contains_str("foobar"));
637 /// ```
638 #[inline]
contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool639 fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool {
640 self.find(needle).is_some()
641 }
642
643 /// Returns true if and only if this byte string has the given prefix.
644 ///
645 /// # Examples
646 ///
647 /// Basic usage:
648 ///
649 /// ```
650 /// use bstr::ByteSlice;
651 ///
652 /// assert!(b"foo bar".starts_with_str("foo"));
653 /// assert!(!b"foo bar".starts_with_str("bar"));
654 /// assert!(!b"foo".starts_with_str("foobar"));
655 /// ```
656 #[inline]
starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool657 fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool {
658 self.as_bytes().starts_with(prefix.as_ref())
659 }
660
661 /// Returns true if and only if this byte string has the given suffix.
662 ///
663 /// # Examples
664 ///
665 /// Basic usage:
666 ///
667 /// ```
668 /// use bstr::ByteSlice;
669 ///
670 /// assert!(b"foo bar".ends_with_str("bar"));
671 /// assert!(!b"foo bar".ends_with_str("foo"));
672 /// assert!(!b"bar".ends_with_str("foobar"));
673 /// ```
674 #[inline]
ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool675 fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool {
676 self.as_bytes().ends_with(suffix.as_ref())
677 }
678
679 /// Returns the index of the first occurrence of the given needle.
680 ///
681 /// The needle may be any type that can be cheaply converted into a
682 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
683 ///
/// Note that if you are searching for the same needle in many
685 /// different small haystacks, it may be faster to initialize a
686 /// [`Finder`](struct.Finder.html) once, and reuse it for each search.
687 ///
688 /// # Complexity
689 ///
690 /// This routine is guaranteed to have worst case linear time complexity
691 /// with respect to both the needle and the haystack. That is, this runs
692 /// in `O(needle.len() + haystack.len())` time.
693 ///
694 /// This routine is also guaranteed to have worst case constant space
695 /// complexity.
696 ///
697 /// # Examples
698 ///
699 /// Basic usage:
700 ///
701 /// ```
702 /// use bstr::ByteSlice;
703 ///
704 /// let s = b"foo bar baz";
705 /// assert_eq!(Some(0), s.find("foo"));
706 /// assert_eq!(Some(4), s.find("bar"));
707 /// assert_eq!(None, s.find("quux"));
708 /// ```
709 #[inline]
find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize>710 fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
711 Finder::new(needle.as_ref()).find(self.as_bytes())
712 }
713
714 /// Returns the index of the last occurrence of the given needle.
715 ///
716 /// The needle may be any type that can be cheaply converted into a
717 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
718 ///
/// Note that if you are searching for the same needle in many
720 /// different small haystacks, it may be faster to initialize a
721 /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for
722 /// each search.
723 ///
724 /// # Complexity
725 ///
726 /// This routine is guaranteed to have worst case linear time complexity
727 /// with respect to both the needle and the haystack. That is, this runs
728 /// in `O(needle.len() + haystack.len())` time.
729 ///
730 /// This routine is also guaranteed to have worst case constant space
731 /// complexity.
732 ///
733 /// # Examples
734 ///
735 /// Basic usage:
736 ///
737 /// ```
738 /// use bstr::ByteSlice;
739 ///
740 /// let s = b"foo bar baz";
741 /// assert_eq!(Some(0), s.rfind("foo"));
742 /// assert_eq!(Some(4), s.rfind("bar"));
743 /// assert_eq!(Some(8), s.rfind("ba"));
744 /// assert_eq!(None, s.rfind("quux"));
745 /// ```
746 #[inline]
rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize>747 fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> {
748 FinderReverse::new(needle.as_ref()).rfind(self.as_bytes())
749 }
750
751 /// Returns an iterator of the non-overlapping occurrences of the given
752 /// needle. The iterator yields byte offset positions indicating the start
753 /// of each match.
754 ///
755 /// # Complexity
756 ///
757 /// This routine is guaranteed to have worst case linear time complexity
758 /// with respect to both the needle and the haystack. That is, this runs
759 /// in `O(needle.len() + haystack.len())` time.
760 ///
761 /// This routine is also guaranteed to have worst case constant space
762 /// complexity.
763 ///
764 /// # Examples
765 ///
766 /// Basic usage:
767 ///
768 /// ```
769 /// use bstr::ByteSlice;
770 ///
771 /// let s = b"foo bar foo foo quux foo";
772 /// let matches: Vec<usize> = s.find_iter("foo").collect();
773 /// assert_eq!(matches, vec![0, 8, 12, 21]);
774 /// ```
775 ///
776 /// An empty string matches at every position, including the position
777 /// immediately following the last byte:
778 ///
779 /// ```
780 /// use bstr::ByteSlice;
781 ///
782 /// let matches: Vec<usize> = b"foo".find_iter("").collect();
783 /// assert_eq!(matches, vec![0, 1, 2, 3]);
784 ///
785 /// let matches: Vec<usize> = b"".find_iter("").collect();
786 /// assert_eq!(matches, vec![0]);
787 /// ```
788 #[inline]
find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>( &'h self, needle: &'n B, ) -> Find<'h, 'n>789 fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
790 &'h self,
791 needle: &'n B,
792 ) -> Find<'h, 'n> {
793 Find::new(self.as_bytes(), needle.as_ref())
794 }
795
796 /// Returns an iterator of the non-overlapping occurrences of the given
797 /// needle in reverse. The iterator yields byte offset positions indicating
798 /// the start of each match.
799 ///
800 /// # Complexity
801 ///
802 /// This routine is guaranteed to have worst case linear time complexity
803 /// with respect to both the needle and the haystack. That is, this runs
804 /// in `O(needle.len() + haystack.len())` time.
805 ///
806 /// This routine is also guaranteed to have worst case constant space
807 /// complexity.
808 ///
809 /// # Examples
810 ///
811 /// Basic usage:
812 ///
813 /// ```
814 /// use bstr::ByteSlice;
815 ///
816 /// let s = b"foo bar foo foo quux foo";
817 /// let matches: Vec<usize> = s.rfind_iter("foo").collect();
818 /// assert_eq!(matches, vec![21, 12, 8, 0]);
819 /// ```
820 ///
821 /// An empty string matches at every position, including the position
822 /// immediately following the last byte:
823 ///
824 /// ```
825 /// use bstr::ByteSlice;
826 ///
827 /// let matches: Vec<usize> = b"foo".rfind_iter("").collect();
828 /// assert_eq!(matches, vec![3, 2, 1, 0]);
829 ///
830 /// let matches: Vec<usize> = b"".rfind_iter("").collect();
831 /// assert_eq!(matches, vec![0]);
832 /// ```
833 #[inline]
rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>( &'h self, needle: &'n B, ) -> FindReverse<'h, 'n>834 fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>(
835 &'h self,
836 needle: &'n B,
837 ) -> FindReverse<'h, 'n> {
838 FindReverse::new(self.as_bytes(), needle.as_ref())
839 }
840
841 /// Returns the index of the first occurrence of the given byte. If the
842 /// byte does not occur in this byte string, then `None` is returned.
843 ///
844 /// # Examples
845 ///
846 /// Basic usage:
847 ///
848 /// ```
849 /// use bstr::ByteSlice;
850 ///
851 /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z'));
852 /// assert_eq!(None, b"foo bar baz".find_byte(b'y'));
853 /// ```
854 #[inline]
find_byte(&self, byte: u8) -> Option<usize>855 fn find_byte(&self, byte: u8) -> Option<usize> {
856 memchr(byte, self.as_bytes())
857 }
858
859 /// Returns the index of the last occurrence of the given byte. If the
860 /// byte does not occur in this byte string, then `None` is returned.
861 ///
862 /// # Examples
863 ///
864 /// Basic usage:
865 ///
866 /// ```
867 /// use bstr::ByteSlice;
868 ///
869 /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z'));
870 /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y'));
871 /// ```
872 #[inline]
rfind_byte(&self, byte: u8) -> Option<usize>873 fn rfind_byte(&self, byte: u8) -> Option<usize> {
874 memrchr(byte, self.as_bytes())
875 }
876
877 /// Returns the index of the first occurrence of the given codepoint.
878 /// If the codepoint does not occur in this byte string, then `None` is
879 /// returned.
880 ///
881 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
882 /// then only explicit occurrences of that encoding will be found. Invalid
883 /// UTF-8 sequences will not be matched.
884 ///
885 /// # Examples
886 ///
887 /// Basic usage:
888 ///
889 /// ```
890 /// use bstr::{B, ByteSlice};
891 ///
892 /// assert_eq!(Some(10), b"foo bar baz".find_char('z'));
893 /// assert_eq!(Some(4), B("αβγγδ").find_char('γ'));
894 /// assert_eq!(None, b"foo bar baz".find_char('y'));
895 /// ```
896 #[inline]
find_char(&self, ch: char) -> Option<usize>897 fn find_char(&self, ch: char) -> Option<usize> {
898 self.find(ch.encode_utf8(&mut [0; 4]))
899 }
900
901 /// Returns the index of the last occurrence of the given codepoint.
902 /// If the codepoint does not occur in this byte string, then `None` is
903 /// returned.
904 ///
905 /// Note that if one searches for the replacement codepoint, `\u{FFFD}`,
906 /// then only explicit occurrences of that encoding will be found. Invalid
907 /// UTF-8 sequences will not be matched.
908 ///
909 /// # Examples
910 ///
911 /// Basic usage:
912 ///
913 /// ```
914 /// use bstr::{B, ByteSlice};
915 ///
916 /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z'));
917 /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ'));
918 /// assert_eq!(None, b"foo bar baz".rfind_char('y'));
919 /// ```
920 #[inline]
rfind_char(&self, ch: char) -> Option<usize>921 fn rfind_char(&self, ch: char) -> Option<usize> {
922 self.rfind(ch.encode_utf8(&mut [0; 4]))
923 }
924
925 /// Returns the index of the first occurrence of any of the bytes in the
926 /// provided set.
927 ///
928 /// The `byteset` may be any type that can be cheaply converted into a
929 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
930 /// note that passing a `&str` which contains multibyte characters may not
931 /// behave as you expect: each byte in the `&str` is treated as an
932 /// individual member of the byte set.
933 ///
934 /// Note that order is irrelevant for the `byteset` parameter, and
935 /// duplicate bytes present in its body are ignored.
936 ///
937 /// # Complexity
938 ///
939 /// This routine is guaranteed to have worst case linear time complexity
940 /// with respect to both the set of bytes and the haystack. That is, this
941 /// runs in `O(byteset.len() + haystack.len())` time.
942 ///
943 /// This routine is also guaranteed to have worst case constant space
944 /// complexity.
945 ///
946 /// # Examples
947 ///
948 /// Basic usage:
949 ///
950 /// ```
951 /// use bstr::ByteSlice;
952 ///
953 /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6));
954 /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4));
955 /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n"));
956 /// // The empty byteset never matches.
957 /// assert_eq!(None, b"abc".find_byteset(b""));
958 /// assert_eq!(None, b"".find_byteset(b""));
959 /// ```
960 #[inline]
find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>961 fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
962 byteset::find(self.as_bytes(), byteset.as_ref())
963 }
964
965 /// Returns the index of the first occurrence of a byte that is not a
966 /// member of the provided set.
967 ///
968 /// The `byteset` may be any type that can be cheaply converted into a
969 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
970 /// note that passing a `&str` which contains multibyte characters may not
971 /// behave as you expect: each byte in the `&str` is treated as an
972 /// individual member of the byte set.
973 ///
974 /// Note that order is irrelevant for the `byteset` parameter, and
975 /// duplicate bytes present in its body are ignored.
976 ///
977 /// # Complexity
978 ///
979 /// This routine is guaranteed to have worst case linear time complexity
980 /// with respect to both the set of bytes and the haystack. That is, this
981 /// runs in `O(byteset.len() + haystack.len())` time.
982 ///
983 /// This routine is also guaranteed to have worst case constant space
984 /// complexity.
985 ///
986 /// # Examples
987 ///
988 /// Basic usage:
989 ///
990 /// ```
991 /// use bstr::ByteSlice;
992 ///
993 /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4));
994 /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2));
995 /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0));
996 /// // The negation of the empty byteset matches everything.
997 /// assert_eq!(Some(0), b"abc".find_not_byteset(b""));
998 /// // But an empty string never contains anything.
999 /// assert_eq!(None, b"".find_not_byteset(b""));
1000 /// ```
1001 #[inline]
find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>1002 fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1003 byteset::find_not(self.as_bytes(), byteset.as_ref())
1004 }
1005
1006 /// Returns the index of the last occurrence of any of the bytes in the
1007 /// provided set.
1008 ///
1009 /// The `byteset` may be any type that can be cheaply converted into a
1010 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
1011 /// note that passing a `&str` which contains multibyte characters may not
1012 /// behave as you expect: each byte in the `&str` is treated as an
1013 /// individual member of the byte set.
1014 ///
1015 /// Note that order is irrelevant for the `byteset` parameter, and duplicate
1016 /// bytes present in its body are ignored.
1017 ///
1018 /// # Complexity
1019 ///
1020 /// This routine is guaranteed to have worst case linear time complexity
1021 /// with respect to both the set of bytes and the haystack. That is, this
1022 /// runs in `O(byteset.len() + haystack.len())` time.
1023 ///
1024 /// This routine is also guaranteed to have worst case constant space
1025 /// complexity.
1026 ///
1027 /// # Examples
1028 ///
1029 /// Basic usage:
1030 ///
1031 /// ```
1032 /// use bstr::ByteSlice;
1033 ///
1034 /// assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9));
1035 /// assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10));
1036 /// assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None);
1037 /// ```
1038 #[inline]
rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>1039 fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1040 byteset::rfind(self.as_bytes(), byteset.as_ref())
1041 }
1042
1043 /// Returns the index of the last occurrence of a byte that is not a member
1044 /// of the provided set.
1045 ///
1046 /// The `byteset` may be any type that can be cheaply converted into a
1047 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but
1048 /// note that passing a `&str` which contains multibyte characters may not
1049 /// behave as you expect: each byte in the `&str` is treated as an
1050 /// individual member of the byte set.
1051 ///
1052 /// Note that order is irrelevant for the `byteset` parameter, and
1053 /// duplicate bytes present in its body are ignored.
1054 ///
1055 /// # Complexity
1056 ///
1057 /// This routine is guaranteed to have worst case linear time complexity
1058 /// with respect to both the set of bytes and the haystack. That is, this
1059 /// runs in `O(byteset.len() + haystack.len())` time.
1060 ///
1061 /// This routine is also guaranteed to have worst case constant space
1062 /// complexity.
1063 ///
1064 /// # Examples
1065 ///
1066 /// Basic usage:
1067 ///
1068 /// ```
1069 /// use bstr::ByteSlice;
1070 ///
1071 /// assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10));
1072 /// assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2));
1073 /// assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz "));
1074 /// ```
1075 #[inline]
rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize>1076 fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> {
1077 byteset::rfind_not(self.as_bytes(), byteset.as_ref())
1078 }
1079
1080 /// Returns an iterator over the fields in a byte string, separated
1081 /// by contiguous whitespace (according to the Unicode property
1082 /// `White_Space`).
1083 ///
1084 /// # Example
1085 ///
1086 /// Basic usage:
1087 ///
1088 /// ```
1089 /// use bstr::{B, ByteSlice};
1090 ///
1091 /// let s = B(" foo\tbar\t\u{2003}\nquux \n");
1092 /// let fields: Vec<&[u8]> = s.fields().collect();
1093 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1094 /// ```
1095 ///
1096 /// A byte string consisting of just whitespace yields no elements:
1097 ///
1098 /// ```
1099 /// use bstr::{B, ByteSlice};
1100 ///
1101 /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count());
1102 /// ```
1103 #[cfg(feature = "unicode")]
1104 #[inline]
fields(&self) -> Fields<'_>1105 fn fields(&self) -> Fields<'_> {
1106 Fields::new(self.as_bytes())
1107 }
1108
1109 /// Returns an iterator over the fields in a byte string, separated by
1110 /// contiguous codepoints satisfying the given predicate.
1111 ///
1112 /// If this byte string is not valid UTF-8, then the given closure will
1113 /// be called with a Unicode replacement codepoint when invalid UTF-8
1114 /// bytes are seen.
1115 ///
1116 /// # Example
1117 ///
1118 /// Basic usage:
1119 ///
1120 /// ```
1121 /// use bstr::{B, ByteSlice};
1122 ///
1123 /// let s = b"123foo999999bar1quux123456";
1124 /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect();
1125 /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]);
1126 /// ```
1127 ///
1128 /// A byte string consisting of all codepoints satisfying the predicate
1129 /// yields no elements:
1130 ///
1131 /// ```
1132 /// use bstr::ByteSlice;
1133 ///
1134 /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count());
1135 /// ```
1136 #[inline]
fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F>1137 fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> {
1138 FieldsWith::new(self.as_bytes(), f)
1139 }
1140
1141 /// Returns an iterator over substrings of this byte string, separated
1142 /// by the given byte string. Each element yielded is guaranteed not to
1143 /// include the splitter substring.
1144 ///
1145 /// The splitter may be any type that can be cheaply converted into a
1146 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1147 ///
1148 /// # Examples
1149 ///
1150 /// Basic usage:
1151 ///
1152 /// ```
1153 /// use bstr::{B, ByteSlice};
1154 ///
1155 /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect();
1156 /// assert_eq!(x, vec![
1157 /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"),
1158 /// ]);
1159 ///
1160 /// let x: Vec<&[u8]> = b"".split_str("X").collect();
1161 /// assert_eq!(x, vec![b""]);
1162 ///
1163 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect();
1164 /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]);
1165 ///
1166 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect();
1167 /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]);
1168 /// ```
1169 ///
1170 /// If a string contains multiple contiguous separators, you will end up
1171 /// with empty strings yielded by the iterator:
1172 ///
1173 /// ```
1174 /// use bstr::{B, ByteSlice};
1175 ///
1176 /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect();
1177 /// assert_eq!(x, vec![
1178 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1179 /// ]);
1180 ///
1181 /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect();
1182 /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]);
1183 /// ```
1184 ///
1185 /// Separators at the start or end of a string are neighbored by empty
1186 /// strings.
1187 ///
1188 /// ```
1189 /// use bstr::{B, ByteSlice};
1190 ///
1191 /// let x: Vec<&[u8]> = b"010".split_str("0").collect();
1192 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1193 /// ```
1194 ///
1195 /// When the empty string is used as a separator, it splits every **byte**
1196 /// in the byte string, along with the beginning and end of the byte
1197 /// string.
1198 ///
1199 /// ```
1200 /// use bstr::{B, ByteSlice};
1201 ///
1202 /// let x: Vec<&[u8]> = b"rust".split_str("").collect();
1203 /// assert_eq!(x, vec![
1204 /// B(""), B("r"), B("u"), B("s"), B("t"), B(""),
1205 /// ]);
1206 ///
1207 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1208 /// // may not be valid UTF-8!
1209 /// let x: Vec<&[u8]> = B("☃").split_str("").collect();
1210 /// assert_eq!(x, vec![
1211 /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""),
1212 /// ]);
1213 /// ```
1214 ///
1215 /// Contiguous separators, especially whitespace, can lead to possibly
1216 /// surprising behavior. For example, this code is correct:
1217 ///
1218 /// ```
1219 /// use bstr::{B, ByteSlice};
1220 ///
1221 /// let x: Vec<&[u8]> = b" a b c".split_str(" ").collect();
1222 /// assert_eq!(x, vec![
1223 /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"),
1224 /// ]);
1225 /// ```
1226 ///
1227 /// It does *not* give you `["a", "b", "c"]`. For that behavior, use
1228 /// [`fields`](#method.fields) instead.
1229 #[inline]
split_str<'h, 's, B: ?Sized + AsRef<[u8]>>( &'h self, splitter: &'s B, ) -> Split<'h, 's>1230 fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
1231 &'h self,
1232 splitter: &'s B,
1233 ) -> Split<'h, 's> {
1234 Split::new(self.as_bytes(), splitter.as_ref())
1235 }
1236
1237 /// Returns an iterator over substrings of this byte string, separated by
1238 /// the given byte string, in reverse. Each element yielded is guaranteed
1239 /// not to include the splitter substring.
1240 ///
1241 /// The splitter may be any type that can be cheaply converted into a
1242 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1243 ///
1244 /// # Examples
1245 ///
1246 /// Basic usage:
1247 ///
1248 /// ```
1249 /// use bstr::{B, ByteSlice};
1250 ///
1251 /// let x: Vec<&[u8]> =
1252 /// b"Mary had a little lamb".rsplit_str(" ").collect();
1253 /// assert_eq!(x, vec![
1254 /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"),
1255 /// ]);
1256 ///
1257 /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect();
1258 /// assert_eq!(x, vec![b""]);
1259 ///
1260 /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect();
1261 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]);
1262 ///
1263 /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect();
1264 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]);
1265 /// ```
1266 ///
1267 /// If a string contains multiple contiguous separators, you will end up
1268 /// with empty strings yielded by the iterator:
1269 ///
1270 /// ```
1271 /// use bstr::{B, ByteSlice};
1272 ///
1273 /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect();
1274 /// assert_eq!(x, vec![
1275 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1276 /// ]);
1277 ///
1278 /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect();
1279 /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]);
1280 /// ```
1281 ///
1282 /// Separators at the start or end of a string are neighbored by empty
1283 /// strings.
1284 ///
1285 /// ```
1286 /// use bstr::{B, ByteSlice};
1287 ///
1288 /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect();
1289 /// assert_eq!(x, vec![B(""), B("1"), B("")]);
1290 /// ```
1291 ///
1292 /// When the empty string is used as a separator, it splits every **byte**
1293 /// in the byte string, along with the beginning and end of the byte
1294 /// string.
1295 ///
1296 /// ```
1297 /// use bstr::{B, ByteSlice};
1298 ///
1299 /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect();
1300 /// assert_eq!(x, vec![
1301 /// B(""), B("t"), B("s"), B("u"), B("r"), B(""),
1302 /// ]);
1303 ///
1304 /// // Splitting by an empty string is not UTF-8 aware. Elements yielded
1305 /// // may not be valid UTF-8!
1306 /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect();
1307 /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]);
1308 /// ```
1309 ///
1310 /// Contiguous separators, especially whitespace, can lead to possibly
1311 /// surprising behavior. For example, this code is correct:
1312 ///
1313 /// ```
1314 /// use bstr::{B, ByteSlice};
1315 ///
1316 /// let x: Vec<&[u8]> = b" a b c".rsplit_str(" ").collect();
1317 /// assert_eq!(x, vec![
1318 /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""),
1319 /// ]);
1320 /// ```
1321 ///
1322 /// It does *not* give you `["a", "b", "c"]`.
1323 #[inline]
rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>( &'h self, splitter: &'s B, ) -> SplitReverse<'h, 's>1324 fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
1325 &'h self,
1326 splitter: &'s B,
1327 ) -> SplitReverse<'h, 's> {
1328 SplitReverse::new(self.as_bytes(), splitter.as_ref())
1329 }
1330
1331 /// Split this byte string at the first occurrence of `splitter`.
1332 ///
1333 /// If the `splitter` is found in the byte string, returns a tuple
1334 /// containing the parts of the string before and after the first occurrence
1335 /// of `splitter` respectively. Otherwise, if there are no occurrences of
1336 /// `splitter` in the byte string, returns `None`.
1337 ///
1338 /// The splitter may be any type that can be cheaply converted into a
1339 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1340 ///
1341 /// If you need to split on the *last* instance of a delimiter instead, see
/// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method.
1343 ///
1344 /// # Examples
1345 ///
1346 /// Basic usage:
1347 ///
1348 /// ```
1349 /// use bstr::{B, ByteSlice};
1350 ///
1351 /// assert_eq!(
1352 /// B("foo,bar").split_once_str(","),
1353 /// Some((B("foo"), B("bar"))),
1354 /// );
1355 /// assert_eq!(
1356 /// B("foo,bar,baz").split_once_str(","),
1357 /// Some((B("foo"), B("bar,baz"))),
1358 /// );
1359 /// assert_eq!(B("foo").split_once_str(","), None);
1360 /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B(""))));
1361 /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo"))));
1362 /// ```
1363 #[inline]
split_once_str<'a, B: ?Sized + AsRef<[u8]>>( &'a self, splitter: &B, ) -> Option<(&'a [u8], &'a [u8])>1364 fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>(
1365 &'a self,
1366 splitter: &B,
1367 ) -> Option<(&'a [u8], &'a [u8])> {
1368 let bytes = self.as_bytes();
1369 let splitter = splitter.as_ref();
1370 let start = Finder::new(splitter).find(bytes)?;
1371 let end = start + splitter.len();
1372 Some((&bytes[..start], &bytes[end..]))
1373 }
1374
1375 /// Split this byte string at the last occurrence of `splitter`.
1376 ///
1377 /// If the `splitter` is found in the byte string, returns a tuple
1378 /// containing the parts of the string before and after the last occurrence
1379 /// of `splitter`, respectively. Otherwise, if there are no occurrences of
1380 /// `splitter` in the byte string, returns `None`.
1381 ///
1382 /// The splitter may be any type that can be cheaply converted into a
1383 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1384 ///
1385 /// If you need to split on the *first* instance of a delimiter instead, see
1386 /// the [`ByteSlice::split_once_str`](#method.split_once_str) method.
1387 ///
1388 /// # Examples
1389 ///
1390 /// Basic usage:
1391 ///
1392 /// ```
1393 /// use bstr::{B, ByteSlice};
1394 ///
1395 /// assert_eq!(
1396 /// B("foo,bar").rsplit_once_str(","),
1397 /// Some((B("foo"), B("bar"))),
1398 /// );
1399 /// assert_eq!(
1400 /// B("foo,bar,baz").rsplit_once_str(","),
1401 /// Some((B("foo,bar"), B("baz"))),
1402 /// );
1403 /// assert_eq!(B("foo").rsplit_once_str(","), None);
1404 /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B(""))));
1405 /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo"))));
1406 /// ```
1407 #[inline]
rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>( &'a self, splitter: &B, ) -> Option<(&'a [u8], &'a [u8])>1408 fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>(
1409 &'a self,
1410 splitter: &B,
1411 ) -> Option<(&'a [u8], &'a [u8])> {
1412 let bytes = self.as_bytes();
1413 let splitter = splitter.as_ref();
1414 let start = FinderReverse::new(splitter).rfind(bytes)?;
1415 let end = start + splitter.len();
1416 Some((&bytes[..start], &bytes[end..]))
1417 }
1418
1419 /// Returns an iterator of at most `limit` substrings of this byte string,
1420 /// separated by the given byte string. If `limit` substrings are yielded,
1421 /// then the last substring will contain the remainder of this byte string.
1422 ///
/// The splitter may be any type that can be cheaply converted into a
1424 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1425 ///
1426 /// # Examples
1427 ///
1428 /// Basic usage:
1429 ///
1430 /// ```
1431 /// use bstr::{B, ByteSlice};
1432 ///
1433 /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect();
1434 /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]);
1435 ///
1436 /// let x: Vec<_> = b"".splitn_str(3, "X").collect();
1437 /// assert_eq!(x, vec![b""]);
1438 ///
1439 /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect();
1440 /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]);
1441 ///
1442 /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect();
1443 /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]);
1444 ///
1445 /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect();
1446 /// assert_eq!(x, vec![B("abcXdef")]);
1447 ///
1448 /// let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect();
1449 /// assert_eq!(x, vec![B("abcdef")]);
1450 ///
1451 /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect();
1452 /// assert!(x.is_empty());
1453 /// ```
1454 #[inline]
splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>( &'h self, limit: usize, splitter: &'s B, ) -> SplitN<'h, 's>1455 fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
1456 &'h self,
1457 limit: usize,
1458 splitter: &'s B,
1459 ) -> SplitN<'h, 's> {
1460 SplitN::new(self.as_bytes(), splitter.as_ref(), limit)
1461 }
1462
1463 /// Returns an iterator of at most `limit` substrings of this byte string,
1464 /// separated by the given byte string, in reverse. If `limit` substrings
1465 /// are yielded, then the last substring will contain the remainder of this
1466 /// byte string.
1467 ///
/// The splitter may be any type that can be cheaply converted into a
1469 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
1470 ///
1471 /// # Examples
1472 ///
1473 /// Basic usage:
1474 ///
1475 /// ```
1476 /// use bstr::{B, ByteSlice};
1477 ///
1478 /// let x: Vec<_> =
1479 /// b"Mary had a little lamb".rsplitn_str(3, " ").collect();
1480 /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]);
1481 ///
1482 /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect();
1483 /// assert_eq!(x, vec![b""]);
1484 ///
1485 /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect();
1486 /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]);
1487 ///
1488 /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect();
1489 /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]);
1490 ///
1491 /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect();
1492 /// assert_eq!(x, vec![B("abcXdef")]);
1493 ///
1494 /// let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect();
1495 /// assert_eq!(x, vec![B("abcdef")]);
1496 ///
1497 /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect();
1498 /// assert!(x.is_empty());
1499 /// ```
1500 #[inline]
rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>( &'h self, limit: usize, splitter: &'s B, ) -> SplitNReverse<'h, 's>1501 fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>(
1502 &'h self,
1503 limit: usize,
1504 splitter: &'s B,
1505 ) -> SplitNReverse<'h, 's> {
1506 SplitNReverse::new(self.as_bytes(), splitter.as_ref(), limit)
1507 }
1508
/// Replace all matches of the given needle with the given replacement, and
/// return the result as a new `Vec<u8>`.
1511 ///
1512 /// This routine is useful as a convenience. If you need to reuse an
1513 /// allocation, use [`replace_into`](#method.replace_into) instead.
1514 ///
1515 /// # Examples
1516 ///
1517 /// Basic usage:
1518 ///
1519 /// ```
1520 /// use bstr::ByteSlice;
1521 ///
1522 /// let s = b"this is old".replace("old", "new");
1523 /// assert_eq!(s, "this is new".as_bytes());
1524 /// ```
1525 ///
1526 /// When the pattern doesn't match:
1527 ///
1528 /// ```
1529 /// use bstr::ByteSlice;
1530 ///
1531 /// let s = b"this is old".replace("nada nada", "limonada");
1532 /// assert_eq!(s, "this is old".as_bytes());
1533 /// ```
1534 ///
1535 /// When the needle is an empty string:
1536 ///
1537 /// ```
1538 /// use bstr::ByteSlice;
1539 ///
1540 /// let s = b"foo".replace("", "Z");
1541 /// assert_eq!(s, "ZfZoZoZ".as_bytes());
1542 /// ```
1543 #[cfg(feature = "alloc")]
1544 #[inline]
replace<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, needle: N, replacement: R, ) -> Vec<u8>1545 fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1546 &self,
1547 needle: N,
1548 replacement: R,
1549 ) -> Vec<u8> {
1550 let mut dest = Vec::with_capacity(self.as_bytes().len());
1551 self.replace_into(needle, replacement, &mut dest);
1552 dest
1553 }
1554
/// Replace up to `limit` matches of the given needle with the given
/// replacement, and return the result as a new `Vec<u8>`.
1557 ///
1558 /// This routine is useful as a convenience. If you need to reuse an
1559 /// allocation, use [`replacen_into`](#method.replacen_into) instead.
1560 ///
1561 /// # Examples
1562 ///
1563 /// Basic usage:
1564 ///
1565 /// ```
1566 /// use bstr::ByteSlice;
1567 ///
1568 /// let s = b"foofoo".replacen("o", "z", 2);
1569 /// assert_eq!(s, "fzzfoo".as_bytes());
1570 /// ```
1571 ///
1572 /// When the pattern doesn't match:
1573 ///
1574 /// ```
1575 /// use bstr::ByteSlice;
1576 ///
1577 /// let s = b"foofoo".replacen("a", "z", 2);
1578 /// assert_eq!(s, "foofoo".as_bytes());
1579 /// ```
1580 ///
1581 /// When the needle is an empty string:
1582 ///
1583 /// ```
1584 /// use bstr::ByteSlice;
1585 ///
1586 /// let s = b"foo".replacen("", "Z", 2);
1587 /// assert_eq!(s, "ZfZoo".as_bytes());
1588 /// ```
1589 #[cfg(feature = "alloc")]
1590 #[inline]
replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, needle: N, replacement: R, limit: usize, ) -> Vec<u8>1591 fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1592 &self,
1593 needle: N,
1594 replacement: R,
1595 limit: usize,
1596 ) -> Vec<u8> {
1597 let mut dest = Vec::with_capacity(self.as_bytes().len());
1598 self.replacen_into(needle, replacement, limit, &mut dest);
1599 dest
1600 }
1601
1602 /// Replace all matches of the given needle with the given replacement,
1603 /// and write the result into the provided `Vec<u8>`.
1604 ///
1605 /// This does **not** clear `dest` before writing to it.
1606 ///
/// This routine is useful for reusing an allocation. For a more convenient
1608 /// API, use [`replace`](#method.replace) instead.
1609 ///
1610 /// # Examples
1611 ///
1612 /// Basic usage:
1613 ///
1614 /// ```
1615 /// use bstr::ByteSlice;
1616 ///
1617 /// let s = b"this is old";
1618 ///
1619 /// let mut dest = vec![];
1620 /// s.replace_into("old", "new", &mut dest);
1621 /// assert_eq!(dest, "this is new".as_bytes());
1622 /// ```
1623 ///
1624 /// When the pattern doesn't match:
1625 ///
1626 /// ```
1627 /// use bstr::ByteSlice;
1628 ///
1629 /// let s = b"this is old";
1630 ///
1631 /// let mut dest = vec![];
1632 /// s.replace_into("nada nada", "limonada", &mut dest);
1633 /// assert_eq!(dest, "this is old".as_bytes());
1634 /// ```
1635 ///
1636 /// When the needle is an empty string:
1637 ///
1638 /// ```
1639 /// use bstr::ByteSlice;
1640 ///
1641 /// let s = b"foo";
1642 ///
1643 /// let mut dest = vec![];
1644 /// s.replace_into("", "Z", &mut dest);
1645 /// assert_eq!(dest, "ZfZoZoZ".as_bytes());
1646 /// ```
1647 #[cfg(feature = "alloc")]
1648 #[inline]
replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, needle: N, replacement: R, dest: &mut Vec<u8>, )1649 fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1650 &self,
1651 needle: N,
1652 replacement: R,
1653 dest: &mut Vec<u8>,
1654 ) {
1655 let (needle, replacement) = (needle.as_ref(), replacement.as_ref());
1656
1657 let mut last = 0;
1658 for start in self.find_iter(needle) {
1659 dest.push_str(&self.as_bytes()[last..start]);
1660 dest.push_str(replacement);
1661 last = start + needle.len();
1662 }
1663 dest.push_str(&self.as_bytes()[last..]);
1664 }
1665
1666 /// Replace up to `limit` matches of the given needle with the given
1667 /// replacement, and write the result into the provided `Vec<u8>`.
1668 ///
1669 /// This does **not** clear `dest` before writing to it.
1670 ///
/// This routine is useful for reusing an allocation. For a more convenient
1672 /// API, use [`replacen`](#method.replacen) instead.
1673 ///
1674 /// # Examples
1675 ///
1676 /// Basic usage:
1677 ///
1678 /// ```
1679 /// use bstr::ByteSlice;
1680 ///
1681 /// let s = b"foofoo";
1682 ///
1683 /// let mut dest = vec![];
1684 /// s.replacen_into("o", "z", 2, &mut dest);
1685 /// assert_eq!(dest, "fzzfoo".as_bytes());
1686 /// ```
1687 ///
1688 /// When the pattern doesn't match:
1689 ///
1690 /// ```
1691 /// use bstr::ByteSlice;
1692 ///
1693 /// let s = b"foofoo";
1694 ///
1695 /// let mut dest = vec![];
1696 /// s.replacen_into("a", "z", 2, &mut dest);
1697 /// assert_eq!(dest, "foofoo".as_bytes());
1698 /// ```
1699 ///
1700 /// When the needle is an empty string:
1701 ///
1702 /// ```
1703 /// use bstr::ByteSlice;
1704 ///
1705 /// let s = b"foo";
1706 ///
1707 /// let mut dest = vec![];
1708 /// s.replacen_into("", "Z", 2, &mut dest);
1709 /// assert_eq!(dest, "ZfZoo".as_bytes());
1710 /// ```
1711 #[cfg(feature = "alloc")]
1712 #[inline]
replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( &self, needle: N, replacement: R, limit: usize, dest: &mut Vec<u8>, )1713 fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>(
1714 &self,
1715 needle: N,
1716 replacement: R,
1717 limit: usize,
1718 dest: &mut Vec<u8>,
1719 ) {
1720 let (needle, replacement) = (needle.as_ref(), replacement.as_ref());
1721
1722 let mut last = 0;
1723 for start in self.find_iter(needle).take(limit) {
1724 dest.push_str(&self.as_bytes()[last..start]);
1725 dest.push_str(replacement);
1726 last = start + needle.len();
1727 }
1728 dest.push_str(&self.as_bytes()[last..]);
1729 }
1730
1731 /// Returns an iterator over the bytes in this byte string.
1732 ///
1733 /// # Examples
1734 ///
1735 /// Basic usage:
1736 ///
1737 /// ```
1738 /// use bstr::ByteSlice;
1739 ///
1740 /// let bs = b"foobar";
1741 /// let bytes: Vec<u8> = bs.bytes().collect();
1742 /// assert_eq!(bytes, bs);
1743 /// ```
1744 #[inline]
bytes(&self) -> Bytes<'_>1745 fn bytes(&self) -> Bytes<'_> {
1746 Bytes { it: self.as_bytes().iter() }
1747 }
1748
1749 /// Returns an iterator over the Unicode scalar values in this byte string.
1750 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1751 /// is yielded instead.
1752 ///
1753 /// # Examples
1754 ///
1755 /// Basic usage:
1756 ///
1757 /// ```
1758 /// use bstr::ByteSlice;
1759 ///
1760 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1761 /// let chars: Vec<char> = bs.chars().collect();
/// assert_eq!(vec!['☃', '\u{FFFD}', '\u{1D783}', '\u{FFFD}', 'a'], chars);
1763 /// ```
1764 ///
1765 /// Codepoints can also be iterated over in reverse:
1766 ///
1767 /// ```
1768 /// use bstr::ByteSlice;
1769 ///
1770 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1771 /// let chars: Vec<char> = bs.chars().rev().collect();
/// assert_eq!(vec!['a', '\u{FFFD}', '\u{1D783}', '\u{FFFD}', '☃'], chars);
1773 /// ```
1774 #[inline]
chars(&self) -> Chars<'_>1775 fn chars(&self) -> Chars<'_> {
1776 Chars::new(self.as_bytes())
1777 }
1778
1779 /// Returns an iterator over the Unicode scalar values in this byte string
1780 /// along with their starting and ending byte index positions. If invalid
1781 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1782 /// instead.
1783 ///
1784 /// Note that this is slightly different from the `CharIndices` iterator
1785 /// provided by the standard library. Aside from working on possibly
1786 /// invalid UTF-8, this iterator provides both the corresponding starting
1787 /// and ending byte indices of each codepoint yielded. The ending position
1788 /// is necessary to slice the original byte string when invalid UTF-8 bytes
1789 /// are converted into a Unicode replacement codepoint, since a single
1790 /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes
1791 /// (inclusive).
1792 ///
1793 /// # Examples
1794 ///
1795 /// Basic usage:
1796 ///
1797 /// ```
1798 /// use bstr::ByteSlice;
1799 ///
1800 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1801 /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect();
1802 /// assert_eq!(chars, vec![
1803 /// (0, 3, '☃'),
1804 /// (3, 4, '\u{FFFD}'),
///     (4, 8, '\u{1D783}'),
1806 /// (8, 10, '\u{FFFD}'),
1807 /// (10, 11, 'a'),
1808 /// ]);
1809 /// ```
1810 ///
1811 /// Codepoints can also be iterated over in reverse:
1812 ///
1813 /// ```
1814 /// use bstr::ByteSlice;
1815 ///
1816 /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61";
1817 /// let chars: Vec<(usize, usize, char)> = bs
1818 /// .char_indices()
1819 /// .rev()
1820 /// .collect();
1821 /// assert_eq!(chars, vec![
1822 /// (10, 11, 'a'),
1823 /// (8, 10, '\u{FFFD}'),
///     (4, 8, '\u{1D783}'),
1825 /// (3, 4, '\u{FFFD}'),
1826 /// (0, 3, '☃'),
1827 /// ]);
1828 /// ```
1829 #[inline]
char_indices(&self) -> CharIndices<'_>1830 fn char_indices(&self) -> CharIndices<'_> {
1831 CharIndices::new(self.as_bytes())
1832 }
1833
1834 /// Iterate over chunks of valid UTF-8.
1835 ///
1836 /// The iterator returned yields chunks of valid UTF-8 separated by invalid
1837 /// UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes,
1838 /// which are determined via the "substitution of maximal subparts"
1839 /// strategy described in the docs for the
1840 /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
1841 /// method.
1842 ///
1843 /// # Examples
1844 ///
1845 /// This example shows how to gather all valid and invalid chunks from a
1846 /// byte slice:
1847 ///
1848 /// ```
1849 /// use bstr::{ByteSlice, Utf8Chunk};
1850 ///
1851 /// let bytes = b"foo\xFD\xFEbar\xFF";
1852 ///
1853 /// let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]);
1854 /// for chunk in bytes.utf8_chunks() {
1855 /// if !chunk.valid().is_empty() {
1856 /// valid_chunks.push(chunk.valid());
1857 /// }
1858 /// if !chunk.invalid().is_empty() {
1859 /// invalid_chunks.push(chunk.invalid());
1860 /// }
1861 /// }
1862 ///
1863 /// assert_eq!(valid_chunks, vec!["foo", "bar"]);
1864 /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]);
1865 /// ```
1866 #[inline]
utf8_chunks(&self) -> Utf8Chunks<'_>1867 fn utf8_chunks(&self) -> Utf8Chunks<'_> {
1868 Utf8Chunks { bytes: self.as_bytes() }
1869 }
1870
1871 /// Returns an iterator over the grapheme clusters in this byte string.
1872 /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint
1873 /// is yielded instead.
1874 ///
1875 /// # Examples
1876 ///
1877 /// This example shows how multiple codepoints can combine to form a
1878 /// single grapheme cluster:
1879 ///
1880 /// ```
1881 /// use bstr::ByteSlice;
1882 ///
1883 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1884 /// let graphemes: Vec<&str> = bs.graphemes().collect();
/// assert_eq!(vec!["à̖", "\u{1F1FA}\u{1F1F8}"], graphemes);
1886 /// ```
1887 ///
1888 /// This shows that graphemes can be iterated over in reverse:
1889 ///
1890 /// ```
1891 /// use bstr::ByteSlice;
1892 ///
1893 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1894 /// let graphemes: Vec<&str> = bs.graphemes().rev().collect();
/// assert_eq!(vec!["\u{1F1FA}\u{1F1F8}", "à̖"], graphemes);
1896 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn graphemes(&self) -> Graphemes<'_> {
    // Build the grapheme cluster iterator over this byte string's bytes.
    let bytes = self.as_bytes();
    Graphemes::new(bytes)
}
1902
1903 /// Returns an iterator over the grapheme clusters in this byte string
1904 /// along with their starting and ending byte index positions. If invalid
1905 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1906 /// instead.
1907 ///
1908 /// # Examples
1909 ///
1910 /// This example shows how to get the byte offsets of each individual
1911 /// grapheme cluster:
1912 ///
1913 /// ```
1914 /// use bstr::ByteSlice;
1915 ///
1916 /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes();
1917 /// let graphemes: Vec<(usize, usize, &str)> =
1918 /// bs.grapheme_indices().collect();
/// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "\u{1F1FA}\u{1F1F8}")], graphemes);
1920 /// ```
1921 ///
1922 /// This example shows what happens when invalid UTF-8 is encountered. Note
1923 /// that the offsets are valid indices into the original string, and do
1924 /// not necessarily correspond to the length of the `&str` returned!
1925 ///
1926 /// ```
1927 /// # #[cfg(all(feature = "alloc"))] {
1928 /// use bstr::{ByteSlice, ByteVec};
1929 ///
1930 /// let mut bytes = vec![];
1931 /// bytes.push_str("a\u{0300}\u{0316}");
1932 /// bytes.push(b'\xFF');
1933 /// bytes.push_str("\u{1F1FA}\u{1F1F8}");
1934 ///
1935 /// let graphemes: Vec<(usize, usize, &str)> =
1936 /// bytes.grapheme_indices().collect();
1937 /// assert_eq!(
1938 /// graphemes,
/// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "\u{1F1FA}\u{1F1F8}")]
1940 /// );
1941 /// # }
1942 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn grapheme_indices(&self) -> GraphemeIndices<'_> {
    // Same as `graphemes`, but the iterator also reports byte offsets.
    let bytes = self.as_bytes();
    GraphemeIndices::new(bytes)
}
1948
1949 /// Returns an iterator over the words in this byte string. If invalid
1950 /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded
1951 /// instead.
1952 ///
1953 /// This is similar to
1954 /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks),
1955 /// except it only returns elements that contain a "word" character. A word
1956 /// character is defined by UTS #18 (Annex C) to be the combination of the
1957 /// `Alphabetic` and `Join_Control` properties, along with the
1958 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1959 /// categories.
1960 ///
1961 /// Since words are made up of one or more codepoints, this iterator
1962 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1963 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1964 ///
1965 /// # Examples
1966 ///
1967 /// Basic usage:
1968 ///
1969 /// ```
1970 /// use bstr::ByteSlice;
1971 ///
1972 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
1973 /// let words: Vec<&str> = bs.words().collect();
1974 /// assert_eq!(words, vec![
1975 /// "The", "quick", "brown", "fox", "can't",
1976 /// "jump", "32.3", "feet", "right",
1977 /// ]);
1978 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words(&self) -> Words<'_> {
    // Build the word iterator over this byte string's bytes.
    let bytes = self.as_bytes();
    Words::new(bytes)
}
1984
1985 /// Returns an iterator over the words in this byte string along with
1986 /// their starting and ending byte index positions.
1987 ///
1988 /// This is similar to
1989 /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices),
1990 /// except it only returns elements that contain a "word" character. A word
1991 /// character is defined by UTS #18 (Annex C) to be the combination of the
1992 /// `Alphabetic` and `Join_Control` properties, along with the
1993 /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general
1994 /// categories.
1995 ///
1996 /// Since words are made up of one or more codepoints, this iterator
1997 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
1998 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
1999 ///
2000 /// # Examples
2001 ///
2002 /// This example shows how to get the byte offsets of each individual
2003 /// word:
2004 ///
2005 /// ```
2006 /// use bstr::ByteSlice;
2007 ///
2008 /// let bs = b"can't jump 32.3 feet";
2009 /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect();
2010 /// assert_eq!(words, vec![
2011 /// (0, 5, "can't"),
2012 /// (6, 10, "jump"),
2013 /// (11, 15, "32.3"),
2014 /// (16, 20, "feet"),
2015 /// ]);
2016 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn word_indices(&self) -> WordIndices<'_> {
    // Same as `words`, but the iterator also reports byte offsets.
    let bytes = self.as_bytes();
    WordIndices::new(bytes)
}
2022
2023 /// Returns an iterator over the words in this byte string, along with
2024 /// all breaks between the words. Concatenating all elements yielded by
2025 /// the iterator results in the original string (modulo Unicode replacement
2026 /// codepoint substitutions if invalid UTF-8 is encountered).
2027 ///
2028 /// Since words are made up of one or more codepoints, this iterator
2029 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2030 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2031 ///
2032 /// # Examples
2033 ///
2034 /// Basic usage:
2035 ///
2036 /// ```
2037 /// use bstr::ByteSlice;
2038 ///
2039 /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
2040 /// let words: Vec<&str> = bs.words_with_breaks().collect();
2041 /// assert_eq!(words, vec![
2042 /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")",
2043 /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet",
2044 /// ",", " ", "right", "?",
2045 /// ]);
2046 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words_with_breaks(&self) -> WordsWithBreaks<'_> {
    // Build the iterator that yields words together with the breaks
    // between them.
    let bytes = self.as_bytes();
    WordsWithBreaks::new(bytes)
}
2052
2053 /// Returns an iterator over the words and their byte offsets in this
2054 /// byte string, along with all breaks between the words. Concatenating
2055 /// all elements yielded by the iterator results in the original string
2056 /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is
2057 /// encountered).
2058 ///
2059 /// Since words are made up of one or more codepoints, this iterator
2060 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2061 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2062 ///
2063 /// # Examples
2064 ///
2065 /// This example shows how to get the byte offsets of each individual
2066 /// word:
2067 ///
2068 /// ```
2069 /// use bstr::ByteSlice;
2070 ///
2071 /// let bs = b"can't jump 32.3 feet";
2072 /// let words: Vec<(usize, usize, &str)> =
2073 /// bs.words_with_break_indices().collect();
2074 /// assert_eq!(words, vec![
2075 /// (0, 5, "can't"),
2076 /// (5, 6, " "),
2077 /// (6, 10, "jump"),
2078 /// (10, 11, " "),
2079 /// (11, 15, "32.3"),
2080 /// (15, 16, " "),
2081 /// (16, 20, "feet"),
2082 /// ]);
2083 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn words_with_break_indices(&self) -> WordsWithBreakIndices<'_> {
    // Same as `words_with_breaks`, but the iterator also reports byte
    // offsets.
    let bytes = self.as_bytes();
    WordsWithBreakIndices::new(bytes)
}
2089
2090 /// Returns an iterator over the sentences in this byte string.
2091 ///
2092 /// Typically, a sentence will include its trailing punctuation and
2093 /// whitespace. Concatenating all elements yielded by the iterator
2094 /// results in the original string (modulo Unicode replacement codepoint
2095 /// substitutions if invalid UTF-8 is encountered).
2096 ///
2097 /// Since sentences are made up of one or more codepoints, this iterator
2098 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2099 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2100 ///
2101 /// # Examples
2102 ///
2103 /// Basic usage:
2104 ///
2105 /// ```
2106 /// use bstr::ByteSlice;
2107 ///
2108 /// let bs = b"I want this. Not that. Right now.";
2109 /// let sentences: Vec<&str> = bs.sentences().collect();
2110 /// assert_eq!(sentences, vec![
2111 /// "I want this. ",
2112 /// "Not that. ",
2113 /// "Right now.",
2114 /// ]);
2115 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn sentences(&self) -> Sentences<'_> {
    // Build the sentence iterator over this byte string's bytes.
    let bytes = self.as_bytes();
    Sentences::new(bytes)
}
2121
2122 /// Returns an iterator over the sentences in this byte string along with
2123 /// their starting and ending byte index positions.
2124 ///
2125 /// Typically, a sentence will include its trailing punctuation and
2126 /// whitespace. Concatenating all elements yielded by the iterator
2127 /// results in the original string (modulo Unicode replacement codepoint
2128 /// substitutions if invalid UTF-8 is encountered).
2129 ///
2130 /// Since sentences are made up of one or more codepoints, this iterator
2131 /// yields `&str` elements. When invalid UTF-8 is encountered, replacement
2132 /// codepoints are [substituted](index.html#handling-of-invalid-utf-8).
2133 ///
2134 /// # Examples
2135 ///
2136 /// Basic usage:
2137 ///
2138 /// ```
2139 /// use bstr::ByteSlice;
2140 ///
2141 /// let bs = b"I want this. Not that. Right now.";
2142 /// let sentences: Vec<(usize, usize, &str)> =
2143 /// bs.sentence_indices().collect();
2144 /// assert_eq!(sentences, vec![
2145 /// (0, 13, "I want this. "),
2146 /// (13, 23, "Not that. "),
2147 /// (23, 33, "Right now."),
2148 /// ]);
2149 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn sentence_indices(&self) -> SentenceIndices<'_> {
    // Same as `sentences`, but the iterator also reports byte offsets.
    let bytes = self.as_bytes();
    SentenceIndices::new(bytes)
}
2155
2156 /// An iterator over all lines in a byte string, without their
2157 /// terminators.
2158 ///
2159 /// For this iterator, the only line terminators recognized are `\r\n` and
2160 /// `\n`.
2161 ///
2162 /// # Examples
2163 ///
2164 /// Basic usage:
2165 ///
2166 /// ```
2167 /// use bstr::{B, ByteSlice};
2168 ///
2169 /// let s = b"\
2170 /// foo
2171 ///
2172 /// bar\r
2173 /// baz
2174 ///
2175 ///
2176 /// quux";
2177 /// let lines: Vec<&[u8]> = s.lines().collect();
2178 /// assert_eq!(lines, vec![
2179 /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"),
2180 /// ]);
2181 /// ```
2182 #[inline]
lines(&self) -> Lines<'_>2183 fn lines(&self) -> Lines<'_> {
2184 Lines::new(self.as_bytes())
2185 }
2186
2187 /// An iterator over all lines in a byte string, including their
2188 /// terminators.
2189 ///
2190 /// For this iterator, the only line terminator recognized is `\n`. (Since
2191 /// line terminators are included, this also handles `\r\n` line endings.)
2192 ///
2193 /// Line terminators are only included if they are present in the original
2194 /// byte string. For example, the last line in a byte string may not end
2195 /// with a line terminator.
2196 ///
2197 /// Concatenating all elements yielded by this iterator is guaranteed to
2198 /// yield the original byte string.
2199 ///
2200 /// # Examples
2201 ///
2202 /// Basic usage:
2203 ///
2204 /// ```
2205 /// use bstr::{B, ByteSlice};
2206 ///
2207 /// let s = b"\
2208 /// foo
2209 ///
2210 /// bar\r
2211 /// baz
2212 ///
2213 ///
2214 /// quux";
2215 /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect();
2216 /// assert_eq!(lines, vec![
2217 /// B("foo\n"),
2218 /// B("\n"),
2219 /// B("bar\r\n"),
2220 /// B("baz\n"),
2221 /// B("\n"),
2222 /// B("\n"),
2223 /// B("quux"),
2224 /// ]);
2225 /// ```
2226 #[inline]
lines_with_terminator(&self) -> LinesWithTerminator<'_>2227 fn lines_with_terminator(&self) -> LinesWithTerminator<'_> {
2228 LinesWithTerminator::new(self.as_bytes())
2229 }
2230
2231 /// Return a byte string slice with leading and trailing whitespace
2232 /// removed.
2233 ///
2234 /// Whitespace is defined according to the terms of the `White_Space`
2235 /// Unicode property.
2236 ///
2237 /// # Examples
2238 ///
2239 /// Basic usage:
2240 ///
2241 /// ```
2242 /// use bstr::{B, ByteSlice};
2243 ///
2244 /// let s = B(" foo\tbar\t\u{2003}\n");
2245 /// assert_eq!(s.trim(), B("foo\tbar"));
2246 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim(&self) -> &[u8] {
    // Strip the leading whitespace first, then the trailing whitespace of
    // what remains.
    let without_leading = self.trim_start();
    without_leading.trim_end()
}
2252
2253 /// Return a byte string slice with leading whitespace removed.
2254 ///
2255 /// Whitespace is defined according to the terms of the `White_Space`
2256 /// Unicode property.
2257 ///
2258 /// # Examples
2259 ///
2260 /// Basic usage:
2261 ///
2262 /// ```
2263 /// use bstr::{B, ByteSlice};
2264 ///
2265 /// let s = B(" foo\tbar\t\u{2003}\n");
2266 /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n"));
2267 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim_start(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // `whitespace_len_fwd` gives the offset at which the result starts.
    let skip = whitespace_len_fwd(bytes);
    &bytes[skip..]
}
2274
2275 /// Return a byte string slice with trailing whitespace removed.
2276 ///
2277 /// Whitespace is defined according to the terms of the `White_Space`
2278 /// Unicode property.
2279 ///
2280 /// # Examples
2281 ///
2282 /// Basic usage:
2283 ///
2284 /// ```
2285 /// use bstr::{B, ByteSlice};
2286 ///
2287 /// let s = B(" foo\tbar\t\u{2003}\n");
2288 /// assert_eq!(s.trim_end(), B(" foo\tbar"));
2289 /// ```
#[cfg(feature = "unicode")]
#[inline]
fn trim_end(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // `whitespace_len_rev` gives the offset at which the result ends.
    let keep = whitespace_len_rev(bytes);
    &bytes[..keep]
}
2296
2297 /// Return a byte string slice with leading and trailing characters
2298 /// satisfying the given predicate removed.
2299 ///
2300 /// # Examples
2301 ///
2302 /// Basic usage:
2303 ///
2304 /// ```
2305 /// use bstr::{B, ByteSlice};
2306 ///
2307 /// let s = b"123foo5bar789";
2308 /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar"));
2309 /// ```
2310 #[inline]
trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8]2311 fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2312 self.trim_start_with(&mut trim).trim_end_with(&mut trim)
2313 }
2314
2315 /// Return a byte string slice with leading characters satisfying the given
2316 /// predicate removed.
2317 ///
2318 /// # Examples
2319 ///
2320 /// Basic usage:
2321 ///
2322 /// ```
2323 /// use bstr::{B, ByteSlice};
2324 ///
2325 /// let s = b"123foo5bar789";
2326 /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789"));
2327 /// ```
2328 #[inline]
trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8]2329 fn trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2330 for (s, _, ch) in self.char_indices() {
2331 if !trim(ch) {
2332 return &self.as_bytes()[s..];
2333 }
2334 }
2335 b""
2336 }
2337
2338 /// Return a byte string slice with trailing characters satisfying the
2339 /// given predicate removed.
2340 ///
2341 /// # Examples
2342 ///
2343 /// Basic usage:
2344 ///
2345 /// ```
2346 /// use bstr::{B, ByteSlice};
2347 ///
2348 /// let s = b"123foo5bar789";
2349 /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar"));
2350 /// ```
2351 #[inline]
trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8]2352 fn trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] {
2353 for (_, e, ch) in self.char_indices().rev() {
2354 if !trim(ch) {
2355 return &self.as_bytes()[..e];
2356 }
2357 }
2358 b""
2359 }
2360
2361 /// Returns a new `Vec<u8>` containing the lowercase equivalent of this
2362 /// byte string.
2363 ///
2364 /// In this case, lowercase is defined according to the `Lowercase` Unicode
2365 /// property.
2366 ///
2367 /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
2368 /// then it is written to the given buffer unchanged.
2369 ///
2370 /// Note that some characters in this byte string may expand into multiple
2371 /// characters when changing the case, so the number of bytes written to
2372 /// the given byte string may not be equivalent to the number of bytes in
2373 /// this byte string.
2374 ///
2375 /// If you'd like to reuse an allocation for performance reasons, then use
2376 /// [`to_lowercase_into`](#method.to_lowercase_into) instead.
2377 ///
2378 /// # Examples
2379 ///
2380 /// Basic usage:
2381 ///
2382 /// ```
2383 /// use bstr::{B, ByteSlice};
2384 ///
2385 /// let s = B("HELLO Β");
2386 /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes());
2387 /// ```
2388 ///
2389 /// Scripts without case are not changed:
2390 ///
2391 /// ```
2392 /// use bstr::{B, ByteSlice};
2393 ///
2394 /// let s = B("农历新年");
2395 /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes());
2396 /// ```
2397 ///
2398 /// Invalid UTF-8 remains as is:
2399 ///
2400 /// ```
2401 /// use bstr::{B, ByteSlice};
2402 ///
2403 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2404 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes());
2405 /// ```
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let the `_into` variant do the work
    // (it reserves space itself).
    let mut lowered = vec![];
    self.to_lowercase_into(&mut lowered);
    lowered
}
2413
2414 /// Writes the lowercase equivalent of this byte string into the given
2415 /// buffer. The buffer is not cleared before written to.
2416 ///
2417 /// In this case, lowercase is defined according to the `Lowercase`
2418 /// Unicode property.
2419 ///
2420 /// If invalid UTF-8 is seen, or if a character has no lowercase variant,
2421 /// then it is written to the given buffer unchanged.
2422 ///
2423 /// Note that some characters in this byte string may expand into multiple
2424 /// characters when changing the case, so the number of bytes written to
2425 /// the given byte string may not be equivalent to the number of bytes in
2426 /// this byte string.
2427 ///
2428 /// If you don't need to amortize allocation and instead prefer
2429 /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead.
2430 ///
2431 /// # Examples
2432 ///
2433 /// Basic usage:
2434 ///
2435 /// ```
2436 /// use bstr::{B, ByteSlice};
2437 ///
2438 /// let s = B("HELLO Β");
2439 ///
2440 /// let mut buf = vec![];
2441 /// s.to_lowercase_into(&mut buf);
2442 /// assert_eq!("hello β".as_bytes(), buf.as_bytes());
2443 /// ```
2444 ///
2445 /// Scripts without case are not changed:
2446 ///
2447 /// ```
2448 /// use bstr::{B, ByteSlice};
2449 ///
2450 /// let s = B("农历新年");
2451 ///
2452 /// let mut buf = vec![];
2453 /// s.to_lowercase_into(&mut buf);
2454 /// assert_eq!("农历新年".as_bytes(), buf.as_bytes());
2455 /// ```
2456 ///
2457 /// Invalid UTF-8 remains as is:
2458 ///
2459 /// ```
2460 /// use bstr::{B, ByteSlice};
2461 ///
2462 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2463 ///
2464 /// let mut buf = vec![];
2465 /// s.to_lowercase_into(&mut buf);
2466 /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes());
2467 /// ```
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: Given what std exposes, this is probably the best available
    // approach. Rolling our own case handling might make this a bit
    // faster, but we shouldn't do that unless we need to, e.g., for
    // doing caseless matching or case folding.

    // TODO(BUG): This doesn't handle any special casing rules.

    let bytes = self.as_bytes();
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        if ch == '\u{FFFD}' {
            // A replacement codepoint stands in for invalid UTF-8 here,
            // so copy the original bytes through untouched.
            buf.push_str(&bytes[start..end]);
            continue;
        }
        if ch.is_ascii() {
            buf.push_char(ch.to_ascii_lowercase());
            continue;
        }
        // A single codepoint may lowercase to several codepoints.
        for lower in ch.to_lowercase() {
            buf.push_char(lower);
        }
    }
}
2491
2492 /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of
2493 /// this byte string.
2494 ///
2495 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2496 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2497 /// In particular, the length of the byte string returned is always
2498 /// equivalent to the length of this byte string.
2499 ///
2500 /// If you'd like to reuse an allocation for performance reasons, then use
2501 /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform
2502 /// the conversion in place.
2503 ///
2504 /// # Examples
2505 ///
2506 /// Basic usage:
2507 ///
2508 /// ```
2509 /// use bstr::{B, ByteSlice};
2510 ///
2511 /// let s = B("HELLO Β");
2512 /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes());
2513 /// ```
2514 ///
2515 /// Invalid UTF-8 remains as is:
2516 ///
2517 /// ```
2518 /// use bstr::{B, ByteSlice};
2519 ///
2520 /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ");
2521 /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz"));
2522 /// ```
#[cfg(feature = "alloc")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
    // Defer to the byte slice's own ASCII lowercasing.
    let bytes = self.as_bytes();
    bytes.to_ascii_lowercase()
}
2528
2529 /// Convert this byte string to its lowercase ASCII equivalent in place.
2530 ///
2531 /// In this case, lowercase is only defined in ASCII letters. Namely, the
2532 /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged.
2533 ///
2534 /// If you don't need to do the conversion in
2535 /// place and instead prefer convenience, then use
2536 /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead.
2537 ///
2538 /// # Examples
2539 ///
2540 /// Basic usage:
2541 ///
2542 /// ```
2543 /// use bstr::ByteSlice;
2544 ///
2545 /// let mut s = <Vec<u8>>::from("HELLO Β");
2546 /// s.make_ascii_lowercase();
2547 /// assert_eq!(s, "hello Β".as_bytes());
2548 /// ```
2549 ///
2550 /// Invalid UTF-8 remains as is:
2551 ///
2552 /// ```
2553 /// # #[cfg(feature = "alloc")] {
2554 /// use bstr::{B, ByteSlice, ByteVec};
2555 ///
2556 /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ");
2557 /// s.make_ascii_lowercase();
2558 /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz"));
2559 /// # }
2560 /// ```
2561 #[inline]
make_ascii_lowercase(&mut self)2562 fn make_ascii_lowercase(&mut self) {
2563 self.as_bytes_mut().make_ascii_lowercase();
2564 }
2565
2566 /// Returns a new `Vec<u8>` containing the uppercase equivalent of this
2567 /// byte string.
2568 ///
2569 /// In this case, uppercase is defined according to the `Uppercase`
2570 /// Unicode property.
2571 ///
2572 /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
2573 /// then it is written to the given buffer unchanged.
2574 ///
2575 /// Note that some characters in this byte string may expand into multiple
2576 /// characters when changing the case, so the number of bytes written to
2577 /// the given byte string may not be equivalent to the number of bytes in
2578 /// this byte string.
2579 ///
2580 /// If you'd like to reuse an allocation for performance reasons, then use
2581 /// [`to_uppercase_into`](#method.to_uppercase_into) instead.
2582 ///
2583 /// # Examples
2584 ///
2585 /// Basic usage:
2586 ///
2587 /// ```
2588 /// use bstr::{B, ByteSlice};
2589 ///
2590 /// let s = B("hello β");
2591 /// assert_eq!(s.to_uppercase(), B("HELLO Β"));
2592 /// ```
2593 ///
2594 /// Scripts without case are not changed:
2595 ///
2596 /// ```
2597 /// use bstr::{B, ByteSlice};
2598 ///
2599 /// let s = B("农历新年");
2600 /// assert_eq!(s.to_uppercase(), B("农历新年"));
2601 /// ```
2602 ///
2603 /// Invalid UTF-8 remains as is:
2604 ///
2605 /// ```
2606 /// use bstr::{B, ByteSlice};
2607 ///
2608 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2609 /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2610 /// ```
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let the `_into` variant do the work
    // (it reserves space itself).
    let mut raised = vec![];
    self.to_uppercase_into(&mut raised);
    raised
}
2618
2619 /// Writes the uppercase equivalent of this byte string into the given
2620 /// buffer. The buffer is not cleared before written to.
2621 ///
2622 /// In this case, uppercase is defined according to the `Uppercase`
2623 /// Unicode property.
2624 ///
2625 /// If invalid UTF-8 is seen, or if a character has no uppercase variant,
2626 /// then it is written to the given buffer unchanged.
2627 ///
2628 /// Note that some characters in this byte string may expand into multiple
2629 /// characters when changing the case, so the number of bytes written to
2630 /// the given byte string may not be equivalent to the number of bytes in
2631 /// this byte string.
2632 ///
2633 /// If you don't need to amortize allocation and instead prefer
2634 /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead.
2635 ///
2636 /// # Examples
2637 ///
2638 /// Basic usage:
2639 ///
2640 /// ```
2641 /// use bstr::{B, ByteSlice};
2642 ///
2643 /// let s = B("hello β");
2644 ///
2645 /// let mut buf = vec![];
2646 /// s.to_uppercase_into(&mut buf);
2647 /// assert_eq!(buf, B("HELLO Β"));
2648 /// ```
2649 ///
2650 /// Scripts without case are not changed:
2651 ///
2652 /// ```
2653 /// use bstr::{B, ByteSlice};
2654 ///
2655 /// let s = B("农历新年");
2656 ///
2657 /// let mut buf = vec![];
2658 /// s.to_uppercase_into(&mut buf);
2659 /// assert_eq!(buf, B("农历新年"));
2660 /// ```
2661 ///
2662 /// Invalid UTF-8 remains as is:
2663 ///
2664 /// ```
2665 /// use bstr::{B, ByteSlice};
2666 ///
2667 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2668 ///
2669 /// let mut buf = vec![];
2670 /// s.to_uppercase_into(&mut buf);
2671 /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2672 /// ```
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: Given what std exposes, this is probably the best available
    // approach. Rolling our own case handling might make this a bit
    // faster, but we shouldn't do that unless we need to, e.g., for
    // doing caseless matching or case folding.
    let bytes = self.as_bytes();
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        if ch == '\u{FFFD}' {
            // A replacement codepoint stands in for invalid UTF-8 here,
            // so copy the original bytes through untouched.
            buf.push_str(&bytes[start..end]);
            continue;
        }
        if ch.is_ascii() {
            buf.push_char(ch.to_ascii_uppercase());
            continue;
        }
        // A single codepoint may uppercase to several codepoints.
        for upper in ch.to_uppercase() {
            buf.push_char(upper);
        }
    }
}
2693
2694 /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of
2695 /// this byte string.
2696 ///
2697 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2698 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2699 /// In particular, the length of the byte string returned is always
2700 /// equivalent to the length of this byte string.
2701 ///
2702 /// If you'd like to reuse an allocation for performance reasons, then use
2703 /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform
2704 /// the conversion in place.
2705 ///
2706 /// # Examples
2707 ///
2708 /// Basic usage:
2709 ///
2710 /// ```
2711 /// use bstr::{B, ByteSlice};
2712 ///
2713 /// let s = B("hello β");
2714 /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β"));
2715 /// ```
2716 ///
2717 /// Invalid UTF-8 remains as is:
2718 ///
2719 /// ```
2720 /// use bstr::{B, ByteSlice};
2721 ///
2722 /// let s = B(b"foo\xFFbar\xE2\x98baz");
2723 /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ"));
2724 /// ```
#[cfg(feature = "alloc")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
    // Defer to the byte slice's own ASCII uppercasing.
    let bytes = self.as_bytes();
    bytes.to_ascii_uppercase()
}
2730
2731 /// Convert this byte string to its uppercase ASCII equivalent in place.
2732 ///
2733 /// In this case, uppercase is only defined in ASCII letters. Namely, the
2734 /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged.
2735 ///
2736 /// If you don't need to do the conversion in
2737 /// place and instead prefer convenience, then use
2738 /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead.
2739 ///
2740 /// # Examples
2741 ///
2742 /// Basic usage:
2743 ///
2744 /// ```
2745 /// use bstr::{B, ByteSlice};
2746 ///
2747 /// let mut s = <Vec<u8>>::from("hello β");
2748 /// s.make_ascii_uppercase();
2749 /// assert_eq!(s, B("HELLO β"));
2750 /// ```
2751 ///
2752 /// Invalid UTF-8 remains as is:
2753 ///
2754 /// ```
2755 /// # #[cfg(feature = "alloc")] {
2756 /// use bstr::{B, ByteSlice, ByteVec};
2757 ///
2758 /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz");
2759 /// s.make_ascii_uppercase();
2760 /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ"));
2761 /// # }
2762 /// ```
2763 #[inline]
make_ascii_uppercase(&mut self)2764 fn make_ascii_uppercase(&mut self) {
2765 self.as_bytes_mut().make_ascii_uppercase();
2766 }
2767
2768 /// Reverse the bytes in this string, in place.
2769 ///
2770 /// This is not necessarily a well formed operation! For example, if this
2771 /// byte string contains valid UTF-8 that isn't ASCII, then reversing the
2772 /// string will likely result in invalid UTF-8 and otherwise non-sensical
2773 /// content.
2774 ///
2775 /// Note that this is equivalent to the generic `[u8]::reverse` method.
2776 /// This method is provided to permit callers to explicitly differentiate
2777 /// between reversing bytes, codepoints and graphemes.
2778 ///
2779 /// # Examples
2780 ///
2781 /// Basic usage:
2782 ///
2783 /// ```
2784 /// use bstr::ByteSlice;
2785 ///
2786 /// let mut s = <Vec<u8>>::from("hello");
2787 /// s.reverse_bytes();
2788 /// assert_eq!(s, "olleh".as_bytes());
2789 /// ```
2790 #[inline]
reverse_bytes(&mut self)2791 fn reverse_bytes(&mut self) {
2792 self.as_bytes_mut().reverse();
2793 }
2794
2795 /// Reverse the codepoints in this string, in place.
2796 ///
2797 /// If this byte string is valid UTF-8, then its reversal by codepoint
2798 /// is also guaranteed to be valid UTF-8.
2799 ///
2800 /// This operation is equivalent to the following, but without allocating:
2801 ///
2802 /// ```
2803 /// use bstr::ByteSlice;
2804 ///
2805 /// let mut s = <Vec<u8>>::from("foo☃bar");
2806 ///
2807 /// let mut chars: Vec<char> = s.chars().collect();
2808 /// chars.reverse();
2809 ///
2810 /// let reversed: String = chars.into_iter().collect();
2811 /// assert_eq!(reversed, "rab☃oof");
2812 /// ```
2813 ///
2814 /// Note that this is not necessarily a well formed operation. For example,
2815 /// if this byte string contains grapheme clusters with more than one
2816 /// codepoint, then those grapheme clusters will not necessarily be
2817 /// preserved. If you'd like to preserve grapheme clusters, then use
2818 /// [`reverse_graphemes`](#method.reverse_graphemes) instead.
2819 ///
2820 /// # Examples
2821 ///
2822 /// Basic usage:
2823 ///
2824 /// ```
2825 /// use bstr::ByteSlice;
2826 ///
2827 /// let mut s = <Vec<u8>>::from("foo☃bar");
2828 /// s.reverse_chars();
2829 /// assert_eq!(s, "rab☃oof".as_bytes());
2830 /// ```
2831 ///
2832 /// This example shows that not all reversals lead to a well formed string.
2833 /// For example, in this case, combining marks are used to put accents over
2834 /// some letters, and those accent marks must appear after the codepoints
2835 /// they modify.
2836 ///
2837 /// ```
2838 /// use bstr::{B, ByteSlice};
2839 ///
2840 /// let mut s = <Vec<u8>>::from("résumé");
2841 /// s.reverse_chars();
2842 /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er"));
2843 /// ```
2844 ///
2845 /// A word of warning: the above example relies on the fact that
2846 /// `résumé` is in decomposed normal form, which means there are separate
2847 /// codepoints for the accents above `e`. If it is instead in composed
2848 /// normal form, then the example works:
2849 ///
2850 /// ```
2851 /// use bstr::{B, ByteSlice};
2852 ///
2853 /// let mut s = <Vec<u8>>::from("résumé");
2854 /// s.reverse_chars();
2855 /// assert_eq!(s, B("émusér"));
2856 /// ```
2857 ///
2858 /// The point here is to be cautious and not assume that just because
2859 /// `reverse_chars` works in one case, that it therefore works in all
2860 /// cases.
2861 #[inline]
reverse_chars(&mut self)2862 fn reverse_chars(&mut self) {
2863 let mut i = 0;
2864 loop {
2865 let (_, size) = utf8::decode(&self.as_bytes()[i..]);
2866 if size == 0 {
2867 break;
2868 }
2869 if size > 1 {
2870 self.as_bytes_mut()[i..i + size].reverse_bytes();
2871 }
2872 i += size;
2873 }
2874 self.reverse_bytes();
2875 }
2876
2877 /// Reverse the graphemes in this string, in place.
2878 ///
2879 /// If this byte string is valid UTF-8, then its reversal by grapheme
2880 /// is also guaranteed to be valid UTF-8.
2881 ///
2882 /// This operation is equivalent to the following, but without allocating:
2883 ///
2884 /// ```
2885 /// use bstr::ByteSlice;
2886 ///
2887 /// let mut s = <Vec<u8>>::from("foo☃bar");
2888 ///
2889 /// let mut graphemes: Vec<&str> = s.graphemes().collect();
2890 /// graphemes.reverse();
2891 ///
2892 /// let reversed = graphemes.concat();
2893 /// assert_eq!(reversed, "rab☃oof");
2894 /// ```
2895 ///
2896 /// # Examples
2897 ///
2898 /// Basic usage:
2899 ///
2900 /// ```
2901 /// use bstr::ByteSlice;
2902 ///
2903 /// let mut s = <Vec<u8>>::from("foo☃bar");
2904 /// s.reverse_graphemes();
2905 /// assert_eq!(s, "rab☃oof".as_bytes());
2906 /// ```
2907 ///
2908 /// This example shows how this correctly handles grapheme clusters,
2909 /// unlike `reverse_chars`.
2910 ///
2911 /// ```
2912 /// use bstr::ByteSlice;
2913 ///
2914 /// let mut s = <Vec<u8>>::from("résumé");
2915 /// s.reverse_graphemes();
2916 /// assert_eq!(s, "émusér".as_bytes());
2917 /// ```
2918 #[cfg(feature = "unicode")]
2919 #[inline]
reverse_graphemes(&mut self)2920 fn reverse_graphemes(&mut self) {
2921 use crate::unicode::decode_grapheme;
2922
2923 let mut i = 0;
2924 loop {
2925 let (_, size) = decode_grapheme(&self.as_bytes()[i..]);
2926 if size == 0 {
2927 break;
2928 }
2929 if size > 1 {
2930 self.as_bytes_mut()[i..i + size].reverse_bytes();
2931 }
2932 i += size;
2933 }
2934 self.reverse_bytes();
2935 }
2936
2937 /// Returns true if and only if every byte in this byte string is ASCII.
2938 ///
2939 /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to
2940 /// an ASCII codepoint if and only if it is in the inclusive range
2941 /// `[0, 127]`.
2942 ///
2943 /// # Examples
2944 ///
2945 /// Basic usage:
2946 ///
2947 /// ```
2948 /// use bstr::{B, ByteSlice};
2949 ///
2950 /// assert!(B("abc").is_ascii());
2951 /// assert!(!B("☃βツ").is_ascii());
2952 /// assert!(!B(b"\xFF").is_ascii());
2953 /// ```
2954 #[inline]
is_ascii(&self) -> bool2955 fn is_ascii(&self) -> bool {
2956 ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len()
2957 }
2958
2959 /// Returns true if and only if the entire byte string is valid UTF-8.
2960 ///
2961 /// If you need location information about where a byte string's first
2962 /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method.
2963 ///
2964 /// # Examples
2965 ///
2966 /// Basic usage:
2967 ///
2968 /// ```
2969 /// use bstr::{B, ByteSlice};
2970 ///
2971 /// assert!(B("abc").is_utf8());
2972 /// assert!(B("☃βツ").is_utf8());
2973 /// // invalid bytes
2974 /// assert!(!B(b"abc\xFF").is_utf8());
2975 /// // surrogate encoding
2976 /// assert!(!B(b"\xED\xA0\x80").is_utf8());
2977 /// // incomplete sequence
2978 /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8());
2979 /// // overlong sequence
2980 /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8());
2981 /// ```
2982 #[inline]
is_utf8(&self) -> bool2983 fn is_utf8(&self) -> bool {
2984 utf8::validate(self.as_bytes()).is_ok()
2985 }
2986
2987 /// Returns the last byte in this byte string, if it's non-empty. If this
2988 /// byte string is empty, this returns `None`.
2989 ///
2990 /// Note that this is like the generic `[u8]::last`, except this returns
2991 /// the byte by value instead of a reference to the byte.
2992 ///
2993 /// # Examples
2994 ///
2995 /// Basic usage:
2996 ///
2997 /// ```
2998 /// use bstr::ByteSlice;
2999 ///
3000 /// assert_eq!(Some(b'z'), b"baz".last_byte());
3001 /// assert_eq!(None, b"".last_byte());
3002 /// ```
3003 #[inline]
last_byte(&self) -> Option<u8>3004 fn last_byte(&self) -> Option<u8> {
3005 let bytes = self.as_bytes();
3006 bytes.get(bytes.len().saturating_sub(1)).map(|&b| b)
3007 }
3008
3009 /// Returns the index of the first non-ASCII byte in this byte string (if
3010 /// any such indices exist). Specifically, it returns the index of the
3011 /// first byte with a value greater than or equal to `0x80`.
3012 ///
3013 /// # Examples
3014 ///
3015 /// Basic usage:
3016 ///
3017 /// ```
3018 /// use bstr::{ByteSlice, B};
3019 ///
3020 /// assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte());
3021 /// assert_eq!(None, b"abcde".find_non_ascii_byte());
3022 /// assert_eq!(Some(0), B("").find_non_ascii_byte());
3023 /// ```
3024 #[inline]
find_non_ascii_byte(&self) -> Option<usize>3025 fn find_non_ascii_byte(&self) -> Option<usize> {
3026 let index = ascii::first_non_ascii_byte(self.as_bytes());
3027 if index == self.as_bytes().len() {
3028 None
3029 } else {
3030 Some(index)
3031 }
3032 }
3033 }
3034
3035 /// A single substring searcher fixed to a particular needle.
3036 ///
3037 /// The purpose of this type is to permit callers to construct a substring
3038 /// searcher that can be used to search haystacks without the overhead of
3039 /// constructing the searcher in the first place. This is a somewhat niche
3040 /// concern when it's necessary to re-use the same needle to search multiple
3041 /// different haystacks with as little overhead as possible. In general, using
3042 /// [`ByteSlice::find`](trait.ByteSlice.html#method.find)
3043 /// or
3044 /// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter)
3045 /// is good enough, but `Finder` is useful when you can meaningfully observe
3046 /// searcher construction time in a profile.
3047 ///
3048 /// When the `std` feature is enabled, then this type has an `into_owned`
3049 /// version which permits building a `Finder` that is not connected to the
3050 /// lifetime of its needle.
3051 #[derive(Clone, Debug)]
3052 pub struct Finder<'a>(memmem::Finder<'a>);
3053
3054 impl<'a> Finder<'a> {
3055 /// Create a new finder for the given needle.
3056 #[inline]
new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a>3057 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> {
3058 Finder(memmem::Finder::new(needle.as_ref()))
3059 }
3060
3061 /// Convert this finder into its owned variant, such that it no longer
3062 /// borrows the needle.
3063 ///
3064 /// If this is already an owned finder, then this is a no-op. Otherwise,
3065 /// this copies the needle.
3066 ///
3067 /// This is only available when the `std` feature is enabled.
3068 #[cfg(feature = "std")]
3069 #[inline]
into_owned(self) -> Finder<'static>3070 pub fn into_owned(self) -> Finder<'static> {
3071 Finder(self.0.into_owned())
3072 }
3073
3074 /// Returns the needle that this finder searches for.
3075 ///
3076 /// Note that the lifetime of the needle returned is tied to the lifetime
3077 /// of the finder, and may be shorter than the `'a` lifetime. Namely, a
3078 /// finder's needle can be either borrowed or owned, so the lifetime of the
3079 /// needle returned must necessarily be the shorter of the two.
3080 #[inline]
needle(&self) -> &[u8]3081 pub fn needle(&self) -> &[u8] {
3082 self.0.needle()
3083 }
3084
3085 /// Returns the index of the first occurrence of this needle in the given
3086 /// haystack.
3087 ///
3088 /// The haystack may be any type that can be cheaply converted into a
3089 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3090 ///
3091 /// # Complexity
3092 ///
3093 /// This routine is guaranteed to have worst case linear time complexity
3094 /// with respect to both the needle and the haystack. That is, this runs
3095 /// in `O(needle.len() + haystack.len())` time.
3096 ///
3097 /// This routine is also guaranteed to have worst case constant space
3098 /// complexity.
3099 ///
3100 /// # Examples
3101 ///
3102 /// Basic usage:
3103 ///
3104 /// ```
3105 /// use bstr::Finder;
3106 ///
3107 /// let haystack = "foo bar baz";
3108 /// assert_eq!(Some(0), Finder::new("foo").find(haystack));
3109 /// assert_eq!(Some(4), Finder::new("bar").find(haystack));
3110 /// assert_eq!(None, Finder::new("quux").find(haystack));
3111 /// ```
3112 #[inline]
find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize>3113 pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3114 self.0.find(haystack.as_ref())
3115 }
3116 }
3117
3118 /// A single substring reverse searcher fixed to a particular needle.
3119 ///
3120 /// The purpose of this type is to permit callers to construct a substring
3121 /// searcher that can be used to search haystacks without the overhead of
3122 /// constructing the searcher in the first place. This is a somewhat niche
3123 /// concern when it's necessary to re-use the same needle to search multiple
3124 /// different haystacks with as little overhead as possible. In general, using
3125 /// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind)
3126 /// or
3127 /// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter)
3128 /// is good enough, but `FinderReverse` is useful when you can meaningfully
3129 /// observe searcher construction time in a profile.
3130 ///
3131 /// When the `std` feature is enabled, then this type has an `into_owned`
3132 /// version which permits building a `FinderReverse` that is not connected to
3133 /// the lifetime of its needle.
3134 #[derive(Clone, Debug)]
3135 pub struct FinderReverse<'a>(memmem::FinderRev<'a>);
3136
3137 impl<'a> FinderReverse<'a> {
3138 /// Create a new reverse finder for the given needle.
3139 #[inline]
new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a>3140 pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> {
3141 FinderReverse(memmem::FinderRev::new(needle.as_ref()))
3142 }
3143
3144 /// Convert this finder into its owned variant, such that it no longer
3145 /// borrows the needle.
3146 ///
3147 /// If this is already an owned finder, then this is a no-op. Otherwise,
3148 /// this copies the needle.
3149 ///
3150 /// This is only available when the `std` feature is enabled.
3151 #[cfg(feature = "std")]
3152 #[inline]
into_owned(self) -> FinderReverse<'static>3153 pub fn into_owned(self) -> FinderReverse<'static> {
3154 FinderReverse(self.0.into_owned())
3155 }
3156
3157 /// Returns the needle that this finder searches for.
3158 ///
3159 /// Note that the lifetime of the needle returned is tied to the lifetime
3160 /// of this finder, and may be shorter than the `'a` lifetime. Namely,
3161 /// a finder's needle can be either borrowed or owned, so the lifetime of
3162 /// the needle returned must necessarily be the shorter of the two.
3163 #[inline]
needle(&self) -> &[u8]3164 pub fn needle(&self) -> &[u8] {
3165 self.0.needle()
3166 }
3167
3168 /// Returns the index of the last occurrence of this needle in the given
3169 /// haystack.
3170 ///
3171 /// The haystack may be any type that can be cheaply converted into a
3172 /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`.
3173 ///
3174 /// # Complexity
3175 ///
3176 /// This routine is guaranteed to have worst case linear time complexity
3177 /// with respect to both the needle and the haystack. That is, this runs
3178 /// in `O(needle.len() + haystack.len())` time.
3179 ///
3180 /// This routine is also guaranteed to have worst case constant space
3181 /// complexity.
3182 ///
3183 /// # Examples
3184 ///
3185 /// Basic usage:
3186 ///
3187 /// ```
3188 /// use bstr::FinderReverse;
3189 ///
3190 /// let haystack = "foo bar baz";
3191 /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack));
3192 /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack));
3193 /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack));
3194 /// ```
3195 #[inline]
rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize>3196 pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> {
3197 self.0.rfind(haystack.as_ref())
3198 }
3199 }
3200
3201 /// An iterator over non-overlapping substring matches.
3202 ///
3203 /// Matches are reported by the byte offset at which they begin.
3204 ///
3205 /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
3206 /// needle.
3207 #[derive(Debug)]
3208 pub struct Find<'h, 'n> {
3209 it: memmem::FindIter<'h, 'n>,
3210 haystack: &'h [u8],
3211 needle: &'n [u8],
3212 }
3213
3214 impl<'h, 'n> Find<'h, 'n> {
new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n>3215 fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> {
3216 Find { it: memmem::find_iter(haystack, needle), haystack, needle }
3217 }
3218 }
3219
3220 impl<'h, 'n> Iterator for Find<'h, 'n> {
3221 type Item = usize;
3222
3223 #[inline]
next(&mut self) -> Option<usize>3224 fn next(&mut self) -> Option<usize> {
3225 self.it.next()
3226 }
3227 }
3228
3229 /// An iterator over non-overlapping substring matches in reverse.
3230 ///
3231 /// Matches are reported by the byte offset at which they begin.
3232 ///
3233 /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the
3234 /// needle.
3235 #[derive(Debug)]
3236 pub struct FindReverse<'h, 'n> {
3237 it: memmem::FindRevIter<'h, 'n>,
3238 haystack: &'h [u8],
3239 needle: &'n [u8],
3240 }
3241
3242 impl<'h, 'n> FindReverse<'h, 'n> {
new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n>3243 fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> {
3244 FindReverse {
3245 it: memmem::rfind_iter(haystack, needle),
3246 haystack,
3247 needle,
3248 }
3249 }
3250
haystack(&self) -> &'h [u8]3251 fn haystack(&self) -> &'h [u8] {
3252 self.haystack
3253 }
3254
needle(&self) -> &'n [u8]3255 fn needle(&self) -> &'n [u8] {
3256 self.needle
3257 }
3258 }
3259
3260 impl<'h, 'n> Iterator for FindReverse<'h, 'n> {
3261 type Item = usize;
3262
3263 #[inline]
next(&mut self) -> Option<usize>3264 fn next(&mut self) -> Option<usize> {
3265 self.it.next()
3266 }
3267 }
3268
/// An iterator that yields the bytes of a byte string by value.
///
/// `'a` is the lifetime of the byte string being traversed.
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    /// The underlying slice iterator, which yields references to bytes.
    it: slice::Iter<'a, u8>,
}

impl<'a> Bytes<'a> {
    /// Returns the bytes that have not been yielded yet, as a subslice of
    /// the original data.
    ///
    /// The subslice has the same lifetime as the original slice, so the
    /// iterator can continue to be used while the subslice is alive.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        let Bytes { it } = self;
        it.as_slice()
    }
}

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<u8> {
        self.it.next().copied()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let Bytes { it } = self;
        it.size_hint()
    }
}

impl<'a> DoubleEndedIterator for Bytes<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<u8> {
        self.it.next_back().copied()
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    #[inline]
    fn len(&self) -> usize {
        let Bytes { it } = self;
        it.len()
    }
}

impl<'a> iter::FusedIterator for Bytes<'a> {}
3316
/// An iterator over the whitespace-separated fields of a byte string.
///
/// Whitespace for this iterator is defined by the Unicode property
/// `White_Space`.
///
/// Contiguous runs of whitespace act as a single separator, so the fields
/// in `foo\t\t\n \nbar` are `foo` and `bar`.
///
/// `'a` is the lifetime of the byte string being split.
#[cfg(feature = "unicode")]
#[derive(Debug)]
pub struct Fields<'a> {
    /// The predicate-based field iterator that does the actual splitting,
    /// instantiated with a plain function pointer as its predicate.
    it: FieldsWith<'a, fn(char) -> bool>,
}

#[cfg(feature = "unicode")]
impl<'a> Fields<'a> {
    /// Creates an iterator over the whitespace-separated fields of `bytes`.
    fn new(bytes: &'a [u8]) -> Fields<'a> {
        // A non-capturing closure coerces to the function pointer type that
        // the `it` field requires.
        let is_space: fn(char) -> bool = |ch| ch.is_whitespace();
        Fields { it: bytes.fields_with(is_space) }
    }
}

#[cfg(feature = "unicode")]
impl<'a> Iterator for Fields<'a> {
    type Item = &'a [u8];

    #[inline]
    fn next(&mut self) -> Option<&'a [u8]> {
        let Fields { it } = self;
        it.next()
    }
}
3348
3349 /// An iterator over fields in the byte string, separated by a predicate over
3350 /// codepoints.
3351 ///
3352 /// This iterator splits a byte string based on its predicate function such
3353 /// that the elements returned are separated by contiguous runs of codepoints
3354 /// for which the predicate returns true.
3355 ///
3356 /// `'a` is the lifetime of the byte string being split, while `F` is the type
3357 /// of the predicate, i.e., `FnMut(char) -> bool`.
3358 #[derive(Debug)]
3359 pub struct FieldsWith<'a, F> {
3360 f: F,
3361 bytes: &'a [u8],
3362 chars: CharIndices<'a>,
3363 }
3364
3365 impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> {
new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F>3366 fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> {
3367 FieldsWith { f, bytes, chars: bytes.char_indices() }
3368 }
3369 }
3370
3371 impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> {
3372 type Item = &'a [u8];
3373
3374 #[inline]
next(&mut self) -> Option<&'a [u8]>3375 fn next(&mut self) -> Option<&'a [u8]> {
3376 let (start, mut end);
3377 loop {
3378 match self.chars.next() {
3379 None => return None,
3380 Some((s, e, ch)) => {
3381 if !(self.f)(ch) {
3382 start = s;
3383 end = e;
3384 break;
3385 }
3386 }
3387 }
3388 }
3389 while let Some((_, e, ch)) = self.chars.next() {
3390 if (self.f)(ch) {
3391 break;
3392 }
3393 end = e;
3394 }
3395 Some(&self.bytes[start..end])
3396 }
3397 }
3398
3399 /// An iterator over substrings in a byte string, split by a separator.
3400 ///
3401 /// `'h` is the lifetime of the byte string being split (the haystack), while
3402 /// `'s` is the lifetime of the byte string doing the splitting.
3403 #[derive(Debug)]
3404 pub struct Split<'h, 's> {
3405 finder: Find<'h, 's>,
3406 /// The end position of the previous match of our splitter. The element
3407 /// we yield corresponds to the substring starting at `last` up to the
3408 /// beginning of the next match of the splitter.
3409 last: usize,
3410 /// Only set when iteration is complete. A corner case here is when a
3411 /// splitter is matched at the end of the haystack. At that point, we still
3412 /// need to yield an empty string following it.
3413 done: bool,
3414 }
3415
3416 impl<'h, 's> Split<'h, 's> {
new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's>3417 fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> {
3418 let finder = haystack.find_iter(splitter);
3419 Split { finder, last: 0, done: false }
3420 }
3421 }
3422
3423 impl<'h, 's> Iterator for Split<'h, 's> {
3424 type Item = &'h [u8];
3425
3426 #[inline]
next(&mut self) -> Option<&'h [u8]>3427 fn next(&mut self) -> Option<&'h [u8]> {
3428 let haystack = self.finder.haystack;
3429 match self.finder.next() {
3430 Some(start) => {
3431 let next = &haystack[self.last..start];
3432 self.last = start + self.finder.needle.len();
3433 Some(next)
3434 }
3435 None => {
3436 if self.last >= haystack.len() {
3437 if !self.done {
3438 self.done = true;
3439 Some(b"")
3440 } else {
3441 None
3442 }
3443 } else {
3444 let s = &haystack[self.last..];
3445 self.last = haystack.len();
3446 self.done = true;
3447 Some(s)
3448 }
3449 }
3450 }
3451 }
3452 }
3453
3454 /// An iterator over substrings in a byte string, split by a separator, in
3455 /// reverse.
3456 ///
3457 /// `'h` is the lifetime of the byte string being split (the haystack), while
3458 /// `'s` is the lifetime of the byte string doing the splitting.
3459 #[derive(Debug)]
3460 pub struct SplitReverse<'h, 's> {
3461 finder: FindReverse<'h, 's>,
3462 /// The end position of the previous match of our splitter. The element
3463 /// we yield corresponds to the substring starting at `last` up to the
3464 /// beginning of the next match of the splitter.
3465 last: usize,
3466 /// Only set when iteration is complete. A corner case here is when a
3467 /// splitter is matched at the end of the haystack. At that point, we still
3468 /// need to yield an empty string following it.
3469 done: bool,
3470 }
3471
3472 impl<'h, 's> SplitReverse<'h, 's> {
new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's>3473 fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> {
3474 let finder = haystack.rfind_iter(splitter);
3475 SplitReverse { finder, last: haystack.len(), done: false }
3476 }
3477 }
3478
3479 impl<'h, 's> Iterator for SplitReverse<'h, 's> {
3480 type Item = &'h [u8];
3481
3482 #[inline]
next(&mut self) -> Option<&'h [u8]>3483 fn next(&mut self) -> Option<&'h [u8]> {
3484 let haystack = self.finder.haystack();
3485 match self.finder.next() {
3486 Some(start) => {
3487 let nlen = self.finder.needle().len();
3488 let next = &haystack[start + nlen..self.last];
3489 self.last = start;
3490 Some(next)
3491 }
3492 None => {
3493 if self.last == 0 {
3494 if !self.done {
3495 self.done = true;
3496 Some(b"")
3497 } else {
3498 None
3499 }
3500 } else {
3501 let s = &haystack[..self.last];
3502 self.last = 0;
3503 self.done = true;
3504 Some(s)
3505 }
3506 }
3507 }
3508 }
3509 }
3510
3511 /// An iterator over at most `n` substrings in a byte string, split by a
3512 /// separator.
3513 ///
3514 /// `'h` is the lifetime of the byte string being split (the haystack), while
3515 /// `'s` is the lifetime of the byte string doing the splitting.
3516 #[derive(Debug)]
3517 pub struct SplitN<'h, 's> {
3518 split: Split<'h, 's>,
3519 limit: usize,
3520 count: usize,
3521 }
3522
3523 impl<'h, 's> SplitN<'h, 's> {
new( haystack: &'h [u8], splitter: &'s [u8], limit: usize, ) -> SplitN<'h, 's>3524 fn new(
3525 haystack: &'h [u8],
3526 splitter: &'s [u8],
3527 limit: usize,
3528 ) -> SplitN<'h, 's> {
3529 let split = haystack.split_str(splitter);
3530 SplitN { split, limit, count: 0 }
3531 }
3532 }
3533
3534 impl<'h, 's> Iterator for SplitN<'h, 's> {
3535 type Item = &'h [u8];
3536
3537 #[inline]
next(&mut self) -> Option<&'h [u8]>3538 fn next(&mut self) -> Option<&'h [u8]> {
3539 self.count += 1;
3540 if self.count > self.limit || self.split.done {
3541 None
3542 } else if self.count == self.limit {
3543 Some(&self.split.finder.haystack[self.split.last..])
3544 } else {
3545 self.split.next()
3546 }
3547 }
3548 }
3549
3550 /// An iterator over at most `n` substrings in a byte string, split by a
3551 /// separator, in reverse.
3552 ///
3553 /// `'h` is the lifetime of the byte string being split (the haystack), while
3554 /// `'s` is the lifetime of the byte string doing the splitting.
3555 #[derive(Debug)]
3556 pub struct SplitNReverse<'h, 's> {
3557 split: SplitReverse<'h, 's>,
3558 limit: usize,
3559 count: usize,
3560 }
3561
3562 impl<'h, 's> SplitNReverse<'h, 's> {
new( haystack: &'h [u8], splitter: &'s [u8], limit: usize, ) -> SplitNReverse<'h, 's>3563 fn new(
3564 haystack: &'h [u8],
3565 splitter: &'s [u8],
3566 limit: usize,
3567 ) -> SplitNReverse<'h, 's> {
3568 let split = haystack.rsplit_str(splitter);
3569 SplitNReverse { split, limit, count: 0 }
3570 }
3571 }
3572
3573 impl<'h, 's> Iterator for SplitNReverse<'h, 's> {
3574 type Item = &'h [u8];
3575
3576 #[inline]
next(&mut self) -> Option<&'h [u8]>3577 fn next(&mut self) -> Option<&'h [u8]> {
3578 self.count += 1;
3579 if self.count > self.limit || self.split.done {
3580 None
3581 } else if self.count == self.limit {
3582 Some(&self.split.finder.haystack()[..self.split.last])
3583 } else {
3584 self.split.next()
3585 }
3586 }
3587 }
3588
3589 /// An iterator over all lines in a byte string, without their terminators.
3590 ///
3591 /// For this iterator, the only line terminators recognized are `\r\n` and
3592 /// `\n`.
3593 ///
3594 /// `'a` is the lifetime of the byte string being iterated over.
3595 #[derive(Clone, Debug)]
3596 pub struct Lines<'a> {
3597 it: LinesWithTerminator<'a>,
3598 }
3599
3600 impl<'a> Lines<'a> {
new(bytes: &'a [u8]) -> Lines<'a>3601 fn new(bytes: &'a [u8]) -> Lines<'a> {
3602 Lines { it: LinesWithTerminator::new(bytes) }
3603 }
3604
3605 /// Return a copy of the rest of the underlying bytes without affecting the
3606 /// iterator itself.
3607 ///
3608 /// # Examples
3609 ///
3610 /// Basic usage:
3611 ///
3612 /// ```
3613 /// use bstr::{B, ByteSlice};
3614 ///
3615 /// let s = b"\
3616 /// foo
3617 /// bar\r
3618 /// baz";
3619 /// let mut lines = s.lines();
3620 /// assert_eq!(lines.next(), Some(B("foo")));
3621 /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
3622 /// ```
as_bytes(&self) -> &'a [u8]3623 pub fn as_bytes(&self) -> &'a [u8] {
3624 self.it.bytes
3625 }
3626 }
3627
3628 impl<'a> Iterator for Lines<'a> {
3629 type Item = &'a [u8];
3630
3631 #[inline]
next(&mut self) -> Option<&'a [u8]>3632 fn next(&mut self) -> Option<&'a [u8]> {
3633 Some(trim_last_terminator(self.it.next()?))
3634 }
3635 }
3636
3637 impl<'a> DoubleEndedIterator for Lines<'a> {
3638 #[inline]
next_back(&mut self) -> Option<Self::Item>3639 fn next_back(&mut self) -> Option<Self::Item> {
3640 Some(trim_last_terminator(self.it.next_back()?))
3641 }
3642 }
3643
3644 impl<'a> iter::FusedIterator for Lines<'a> {}
3645
3646 /// An iterator over all lines in a byte string, including their terminators.
3647 ///
3648 /// For this iterator, the only line terminator recognized is `\n`. (Since
3649 /// line terminators are included, this also handles `\r\n` line endings.)
3650 ///
3651 /// Line terminators are only included if they are present in the original
3652 /// byte string. For example, the last line in a byte string may not end with
3653 /// a line terminator.
3654 ///
3655 /// Concatenating all elements yielded by this iterator is guaranteed to yield
3656 /// the original byte string.
3657 ///
3658 /// `'a` is the lifetime of the byte string being iterated over.
3659 #[derive(Clone, Debug)]
3660 pub struct LinesWithTerminator<'a> {
3661 bytes: &'a [u8],
3662 }
3663
3664 impl<'a> LinesWithTerminator<'a> {
new(bytes: &'a [u8]) -> LinesWithTerminator<'a>3665 fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> {
3666 LinesWithTerminator { bytes }
3667 }
3668
3669 /// Return a copy of the rest of the underlying bytes without affecting the
3670 /// iterator itself.
3671 ///
3672 /// # Examples
3673 ///
3674 /// Basic usage:
3675 ///
3676 /// ```
3677 /// use bstr::{B, ByteSlice};
3678 ///
3679 /// let s = b"\
3680 /// foo
3681 /// bar\r
3682 /// baz";
3683 /// let mut lines = s.lines_with_terminator();
3684 /// assert_eq!(lines.next(), Some(B("foo\n")));
3685 /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz"));
3686 /// ```
as_bytes(&self) -> &'a [u8]3687 pub fn as_bytes(&self) -> &'a [u8] {
3688 self.bytes
3689 }
3690 }
3691
3692 impl<'a> Iterator for LinesWithTerminator<'a> {
3693 type Item = &'a [u8];
3694
3695 #[inline]
next(&mut self) -> Option<&'a [u8]>3696 fn next(&mut self) -> Option<&'a [u8]> {
3697 match self.bytes.find_byte(b'\n') {
3698 None if self.bytes.is_empty() => None,
3699 None => {
3700 let line = self.bytes;
3701 self.bytes = b"";
3702 Some(line)
3703 }
3704 Some(end) => {
3705 let line = &self.bytes[..end + 1];
3706 self.bytes = &self.bytes[end + 1..];
3707 Some(line)
3708 }
3709 }
3710 }
3711 }
3712
3713 impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> {
3714 #[inline]
next_back(&mut self) -> Option<Self::Item>3715 fn next_back(&mut self) -> Option<Self::Item> {
3716 let end = self.bytes.len().checked_sub(1)?;
3717 match self.bytes[..end].rfind_byte(b'\n') {
3718 None => {
3719 let line = self.bytes;
3720 self.bytes = b"";
3721 Some(line)
3722 }
3723 Some(end) => {
3724 let line = &self.bytes[end + 1..];
3725 self.bytes = &self.bytes[..end + 1];
3726 Some(line)
3727 }
3728 }
3729 }
3730 }
3731
3732 impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {}
3733
/// Strips one trailing line terminator from `s`, if there is one.
///
/// A final `\n` is removed, together with a `\r` that immediately precedes
/// it. A `\r` that is not followed by `\n` is left alone.
fn trim_last_terminator(s: &[u8]) -> &[u8] {
    match s.split_last() {
        Some((&b'\n', line)) => match line.split_last() {
            Some((&b'\r', line)) => line,
            _ => line,
        },
        _ => s,
    }
}
3743
// Unit tests for lossy UTF-8 conversion and for the line iterators. These
// only build for test runs with the `std` feature enabled.
#[cfg(all(test, feature = "std"))]
mod tests {
    use crate::{
        ext_slice::{ByteSlice, Lines, LinesWithTerminator, B},
        tests::LOSSY_TESTS,
    };

    /// Checks every `(expected, input)` pair of `LOSSY_TESTS` against
    /// `to_str_lossy`, `to_str_lossy_into` and, as a cross-check, the
    /// standard library's `String::from_utf8_lossy`.
    #[test]
    fn to_str_lossy() {
        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let got = B(input).to_str_lossy();
            assert_eq!(
                expected.as_bytes(),
                got.as_bytes(),
                "to_str_lossy(ith: {:?}, given: {:?})",
                i,
                input,
            );

            let mut got = String::new();
            B(input).to_str_lossy_into(&mut got);
            assert_eq!(
                expected.as_bytes(),
                got.as_bytes(),
                "to_str_lossy_into",
            );

            // The expected output must also agree with std's lossy decoder.
            let got = String::from_utf8_lossy(input);
            assert_eq!(expected.as_bytes(), got.as_bytes(), "std");
        }
    }

    /// Checks `Lines` and `LinesWithTerminator` in both directions over a
    /// range of inputs: empty, with and without a trailing terminator, with
    /// blank lines, and with `\r\n` endings.
    #[test]
    fn lines_iteration() {
        // `$it` is expanded twice, so a fresh iterator is built for the
        // forward pass and for the reverse pass. The reverse pass must yield
        // exactly the forward items in the opposite order.
        macro_rules! t {
            ($it:expr, $forward:expr) => {
                let mut res: Vec<&[u8]> = Vec::from($forward);
                assert_eq!($it.collect::<Vec<_>>(), res);
                res.reverse();
                assert_eq!($it.rev().collect::<Vec<_>>(), res);
            };
        }

        // Empty input yields no lines at all.
        t!(Lines::new(b""), []);
        t!(LinesWithTerminator::new(b""), []);

        // A lone terminator is one (empty) line.
        t!(Lines::new(b"\n"), [B("")]);
        t!(Lines::new(b"\r\n"), [B("")]);
        t!(LinesWithTerminator::new(b"\n"), [B("\n")]);

        // Input without any terminator is a single line.
        t!(Lines::new(b"a"), [B("a")]);
        t!(LinesWithTerminator::new(b"a"), [B("a")]);

        t!(Lines::new(b"abc"), [B("abc")]);
        t!(LinesWithTerminator::new(b"abc"), [B("abc")]);

        // A trailing terminator does not produce an extra empty line.
        t!(Lines::new(b"abc\n"), [B("abc")]);
        t!(Lines::new(b"abc\r\n"), [B("abc")]);
        t!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]);

        // Consecutive terminators produce blank lines.
        t!(Lines::new(b"abc\n\n"), [B("abc"), B("")]);
        t!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]);

        t!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]);
        t!(
            LinesWithTerminator::new(b"abc\n\ndef"),
            [B("abc\n"), B("\n"), B("def")]
        );

        t!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]);
        t!(
            LinesWithTerminator::new(b"abc\n\ndef\n"),
            [B("abc\n"), B("\n"), B("def\n")]
        );

        // A leading terminator produces a leading blank line.
        t!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]);
        t!(
            LinesWithTerminator::new(b"\na\nb\n"),
            [B("\n"), B("a\n"), B("b\n")]
        );

        // Only terminators: one blank line per terminator.
        t!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]);
        t!(LinesWithTerminator::new(b"\n\n\n"), [B("\n"), B("\n"), B("\n")]);
    }
}
3829