1 /*!
2 Utilities for working with I/O using byte strings.
3
4 This module currently only exports a single trait, `BufReadExt`, which provides
5 facilities for conveniently and efficiently working with lines as byte strings.
6
7 More APIs may be added in the future.
8 */
9
10 use std::io;
11
12 use ext_slice::ByteSlice;
13 use ext_vec::ByteVec;
14
15 /// An extention trait for
16 /// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
17 /// which provides convenience APIs for dealing with byte strings.
18 pub trait BufReadExt: io::BufRead {
19 /// Returns an iterator over the lines of this reader, where each line
20 /// is represented as a byte string.
21 ///
22 /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
23 /// an error is yielded if there was a problem reading from the underlying
24 /// reader.
25 ///
26 /// On success, the next line in the iterator is returned. The line does
27 /// *not* contain a trailing `\n` or `\r\n`.
28 ///
29 /// # Examples
30 ///
31 /// Basic usage:
32 ///
33 /// ```
34 /// use std::io;
35 ///
36 /// use bstr::io::BufReadExt;
37 ///
38 /// # fn example() -> Result<(), io::Error> {
39 /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
40 ///
41 /// let mut lines = vec![];
42 /// for result in cursor.byte_lines() {
43 /// let line = result?;
44 /// lines.push(line);
45 /// }
46 /// assert_eq!(lines.len(), 3);
47 /// assert_eq!(lines[0], "lorem".as_bytes());
48 /// assert_eq!(lines[1], "ipsum".as_bytes());
49 /// assert_eq!(lines[2], "dolor".as_bytes());
50 /// # Ok(()) }; example().unwrap()
51 /// ```
byte_lines(self) -> ByteLines<Self> where Self: Sized,52 fn byte_lines(self) -> ByteLines<Self>
53 where
54 Self: Sized,
55 {
56 ByteLines { buf: self }
57 }
58
59 /// Returns an iterator over byte-terminated records of this reader, where
60 /// each record is represented as a byte string.
61 ///
62 /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
63 /// an error is yielded if there was a problem reading from the underlying
64 /// reader.
65 ///
66 /// On success, the next record in the iterator is returned. The record
67 /// does *not* contain its trailing terminator.
68 ///
69 /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
70 /// that it has no special handling for `\r`.
71 ///
72 /// # Examples
73 ///
74 /// Basic usage:
75 ///
76 /// ```
77 /// use std::io;
78 ///
79 /// use bstr::io::BufReadExt;
80 ///
81 /// # fn example() -> Result<(), io::Error> {
82 /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
83 ///
84 /// let mut records = vec![];
85 /// for result in cursor.byte_records(b'\x00') {
86 /// let record = result?;
87 /// records.push(record);
88 /// }
89 /// assert_eq!(records.len(), 3);
90 /// assert_eq!(records[0], "lorem".as_bytes());
91 /// assert_eq!(records[1], "ipsum".as_bytes());
92 /// assert_eq!(records[2], "dolor".as_bytes());
93 /// # Ok(()) }; example().unwrap()
94 /// ```
byte_records(self, terminator: u8) -> ByteRecords<Self> where Self: Sized,95 fn byte_records(self, terminator: u8) -> ByteRecords<Self>
96 where
97 Self: Sized,
98 {
99 ByteRecords { terminator, buf: self }
100 }
101
102 /// Executes the given closure on each line in the underlying reader.
103 ///
104 /// If the closure returns an error (or if the underlying reader returns an
105 /// error), then iteration is stopped and the error is returned. If false
106 /// is returned, then iteration is stopped and no error is returned.
107 ///
108 /// The closure given is called on exactly the same values as yielded by
109 /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
110 /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
111 ///
112 /// This routine is useful for iterating over lines as quickly as
113 /// possible. Namely, a single allocation is reused for each line.
114 ///
115 /// # Examples
116 ///
117 /// Basic usage:
118 ///
119 /// ```
120 /// use std::io;
121 ///
122 /// use bstr::io::BufReadExt;
123 ///
124 /// # fn example() -> Result<(), io::Error> {
125 /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
126 ///
127 /// let mut lines = vec![];
128 /// cursor.for_byte_line(|line| {
129 /// lines.push(line.to_vec());
130 /// Ok(true)
131 /// })?;
132 /// assert_eq!(lines.len(), 3);
133 /// assert_eq!(lines[0], "lorem".as_bytes());
134 /// assert_eq!(lines[1], "ipsum".as_bytes());
135 /// assert_eq!(lines[2], "dolor".as_bytes());
136 /// # Ok(()) }; example().unwrap()
137 /// ```
for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,138 fn for_byte_line<F>(self, mut for_each_line: F) -> io::Result<()>
139 where
140 Self: Sized,
141 F: FnMut(&[u8]) -> io::Result<bool>,
142 {
143 self.for_byte_line_with_terminator(|line| {
144 for_each_line(&trim_line_slice(&line))
145 })
146 }
147
148 /// Executes the given closure on each byte-terminated record in the
149 /// underlying reader.
150 ///
151 /// If the closure returns an error (or if the underlying reader returns an
152 /// error), then iteration is stopped and the error is returned. If false
153 /// is returned, then iteration is stopped and no error is returned.
154 ///
155 /// The closure given is called on exactly the same values as yielded by
156 /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
157 /// iterator. Namely, records do _not_ contain a trailing terminator byte.
158 ///
159 /// This routine is useful for iterating over records as quickly as
160 /// possible. Namely, a single allocation is reused for each record.
161 ///
162 /// # Examples
163 ///
164 /// Basic usage:
165 ///
166 /// ```
167 /// use std::io;
168 ///
169 /// use bstr::io::BufReadExt;
170 ///
171 /// # fn example() -> Result<(), io::Error> {
172 /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
173 ///
174 /// let mut records = vec![];
175 /// cursor.for_byte_record(b'\x00', |record| {
176 /// records.push(record.to_vec());
177 /// Ok(true)
178 /// })?;
179 /// assert_eq!(records.len(), 3);
180 /// assert_eq!(records[0], "lorem".as_bytes());
181 /// assert_eq!(records[1], "ipsum".as_bytes());
182 /// assert_eq!(records[2], "dolor".as_bytes());
183 /// # Ok(()) }; example().unwrap()
184 /// ```
for_byte_record<F>( self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,185 fn for_byte_record<F>(
186 self,
187 terminator: u8,
188 mut for_each_record: F,
189 ) -> io::Result<()>
190 where
191 Self: Sized,
192 F: FnMut(&[u8]) -> io::Result<bool>,
193 {
194 self.for_byte_record_with_terminator(terminator, |chunk| {
195 for_each_record(&trim_record_slice(&chunk, terminator))
196 })
197 }
198
199 /// Executes the given closure on each line in the underlying reader.
200 ///
201 /// If the closure returns an error (or if the underlying reader returns an
202 /// error), then iteration is stopped and the error is returned. If false
203 /// is returned, then iteration is stopped and no error is returned.
204 ///
205 /// Unlike
206 /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
207 /// the lines given to the closure *do* include the line terminator, if one
208 /// exists.
209 ///
210 /// This routine is useful for iterating over lines as quickly as
211 /// possible. Namely, a single allocation is reused for each line.
212 ///
213 /// This is identical to `for_byte_record_with_terminator` with a
214 /// terminator of `\n`.
215 ///
216 /// # Examples
217 ///
218 /// Basic usage:
219 ///
220 /// ```
221 /// use std::io;
222 ///
223 /// use bstr::io::BufReadExt;
224 ///
225 /// # fn example() -> Result<(), io::Error> {
226 /// let cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
227 ///
228 /// let mut lines = vec![];
229 /// cursor.for_byte_line_with_terminator(|line| {
230 /// lines.push(line.to_vec());
231 /// Ok(true)
232 /// })?;
233 /// assert_eq!(lines.len(), 3);
234 /// assert_eq!(lines[0], "lorem\n".as_bytes());
235 /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
236 /// assert_eq!(lines[2], "dolor".as_bytes());
237 /// # Ok(()) }; example().unwrap()
238 /// ```
for_byte_line_with_terminator<F>( self, for_each_line: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,239 fn for_byte_line_with_terminator<F>(
240 self,
241 for_each_line: F,
242 ) -> io::Result<()>
243 where
244 Self: Sized,
245 F: FnMut(&[u8]) -> io::Result<bool>,
246 {
247 self.for_byte_record_with_terminator(b'\n', for_each_line)
248 }
249
250 /// Executes the given closure on each byte-terminated record in the
251 /// underlying reader.
252 ///
253 /// If the closure returns an error (or if the underlying reader returns an
254 /// error), then iteration is stopped and the error is returned. If false
255 /// is returned, then iteration is stopped and no error is returned.
256 ///
257 /// Unlike
258 /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
259 /// the lines given to the closure *do* include the record terminator, if
260 /// one exists.
261 ///
262 /// This routine is useful for iterating over records as quickly as
263 /// possible. Namely, a single allocation is reused for each record.
264 ///
265 /// # Examples
266 ///
267 /// Basic usage:
268 ///
269 /// ```
270 /// use std::io;
271 ///
272 /// use bstr::B;
273 /// use bstr::io::BufReadExt;
274 ///
275 /// # fn example() -> Result<(), io::Error> {
276 /// let cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
277 ///
278 /// let mut records = vec![];
279 /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
280 /// records.push(record.to_vec());
281 /// Ok(true)
282 /// })?;
283 /// assert_eq!(records.len(), 3);
284 /// assert_eq!(records[0], B(b"lorem\x00"));
285 /// assert_eq!(records[1], B("ipsum\x00"));
286 /// assert_eq!(records[2], B("dolor"));
287 /// # Ok(()) }; example().unwrap()
288 /// ```
for_byte_record_with_terminator<F>( mut self, terminator: u8, mut for_each_record: F, ) -> io::Result<()> where Self: Sized, F: FnMut(&[u8]) -> io::Result<bool>,289 fn for_byte_record_with_terminator<F>(
290 mut self,
291 terminator: u8,
292 mut for_each_record: F,
293 ) -> io::Result<()>
294 where
295 Self: Sized,
296 F: FnMut(&[u8]) -> io::Result<bool>,
297 {
298 let mut bytes = vec![];
299 let mut res = Ok(());
300 let mut consumed = 0;
301 'outer: loop {
302 // Lend out complete record slices from our buffer
303 {
304 let mut buf = self.fill_buf()?;
305 while let Some(index) = buf.find_byte(terminator) {
306 let (record, rest) = buf.split_at(index + 1);
307 buf = rest;
308 consumed += record.len();
309 match for_each_record(&record) {
310 Ok(false) => break 'outer,
311 Err(err) => {
312 res = Err(err);
313 break 'outer;
314 }
315 _ => (),
316 }
317 }
318
319 // Copy the final record fragment to our local buffer. This
320 // saves read_until() from re-scanning a buffer we know
321 // contains no remaining terminators.
322 bytes.extend_from_slice(&buf);
323 consumed += buf.len();
324 }
325
326 self.consume(consumed);
327 consumed = 0;
328
329 // N.B. read_until uses a different version of memchr that may
330 // be slower than the memchr crate that bstr uses. However, this
331 // should only run for a fairly small number of records, assuming a
332 // decent buffer size.
333 self.read_until(terminator, &mut bytes)?;
334 if bytes.is_empty() || !for_each_record(&bytes)? {
335 break;
336 }
337 bytes.clear();
338 }
339 self.consume(consumed);
340 res
341 }
342 }
343
344 impl<B: io::BufRead> BufReadExt for B {}
345
/// An iterator over lines from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// Each item is an `io::Result<Vec<u8>>`. On success it holds one line with
/// its trailing `\n` or `\r\n` removed.
///
/// This iterator is generally created by calling the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    // The underlying reader that lines are read from.
    buf: B,
}
358
/// An iterator over records from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// A byte record is any sequence of bytes terminated by a particular byte
/// chosen by the caller. For example, NUL separated byte strings are said to
/// be NUL-terminated byte records.
///
/// Each item is an `io::Result<Vec<u8>>`. On success it holds one record
/// with its trailing terminator byte removed.
///
/// This iterator is generally created by calling the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    // The underlying reader that records are read from.
    buf: B,
    // The byte that marks the end of each record.
    terminator: u8,
}
376
377 impl<B: io::BufRead> Iterator for ByteLines<B> {
378 type Item = io::Result<Vec<u8>>;
379
next(&mut self) -> Option<io::Result<Vec<u8>>>380 fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
381 let mut bytes = vec![];
382 match self.buf.read_until(b'\n', &mut bytes) {
383 Err(e) => Some(Err(e)),
384 Ok(0) => None,
385 Ok(_) => {
386 trim_line(&mut bytes);
387 Some(Ok(bytes))
388 }
389 }
390 }
391 }
392
393 impl<B: io::BufRead> Iterator for ByteRecords<B> {
394 type Item = io::Result<Vec<u8>>;
395
next(&mut self) -> Option<io::Result<Vec<u8>>>396 fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
397 let mut bytes = vec![];
398 match self.buf.read_until(self.terminator, &mut bytes) {
399 Err(e) => Some(Err(e)),
400 Ok(0) => None,
401 Ok(_) => {
402 trim_record(&mut bytes, self.terminator);
403 Some(Ok(bytes))
404 }
405 }
406 }
407 }
408
trim_line(line: &mut Vec<u8>)409 fn trim_line(line: &mut Vec<u8>) {
410 if line.last_byte() == Some(b'\n') {
411 line.pop_byte();
412 if line.last_byte() == Some(b'\r') {
413 line.pop_byte();
414 }
415 }
416 }
417
/// Returns `line` with one trailing `\n` or `\r\n` removed, if present.
///
/// A `\r` is only removed when it directly precedes the removed `\n`; any
/// other input is returned unchanged.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    // `split_last` peels off the final byte without any index arithmetic.
    match line.split_last() {
        Some((&b'\n', rest)) => match rest.split_last() {
            Some((&b'\r', rest)) => rest,
            _ => rest,
        },
        _ => line,
    }
}
427
trim_record(record: &mut Vec<u8>, terminator: u8)428 fn trim_record(record: &mut Vec<u8>, terminator: u8) {
429 if record.last_byte() == Some(terminator) {
430 record.pop_byte();
431 }
432 }
433
/// Returns `record` with one trailing `terminator` byte removed, if present.
///
/// Input that does not end in `terminator` (including empty input) is
/// returned unchanged.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    // `split_last` peels off the final byte without any index arithmetic.
    match record.split_last() {
        Some((&last, rest)) if last == terminator => rest,
        _ => record,
    }
}
440
#[cfg(test)]
mod tests {
    use super::BufReadExt;
    use bstring::BString;

    /// Gathers every line of `slice` as seen by `for_byte_line`, i.e. with
    /// line terminators stripped.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut collected = Vec::new();
        let outcome = slice.as_ref().for_byte_line(|bytes| {
            collected.push(BString::from(bytes.to_vec()));
            Ok(true)
        });
        outcome.unwrap();
        collected
    }

    /// Gathers every line of `slice` as seen by
    /// `for_byte_line_with_terminator`, i.e. with line terminators kept.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut collected = Vec::new();
        let outcome = slice.as_ref().for_byte_line_with_terminator(|bytes| {
            collected.push(BString::from(bytes.to_vec()));
            Ok(true)
        });
        outcome.unwrap();
        collected
    }

    #[test]
    fn lines_without_terminator() {
        // Empty input yields no lines at all.
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        // `\n` line endings.
        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        // `\r\n` line endings.
        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A lone `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        // Empty input yields no lines at all.
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        // `\n` line endings are preserved.
        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        // `\r\n` line endings are preserved.
        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A lone `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}
515