• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use core::fmt;
2 
3 use crate::Terminator;
4 
5 // BE ADVISED
6 //
7 // This may just be one of the more complicated CSV parsers you'll come across.
8 // The implementation never allocates and consists of both a functional NFA
9 // parser and a DFA parser. The DFA parser is the work horse and we could elide
10 // much of the work involved in making the NFA parser work, but the NFA parser
11 // is much easier to debug. The NFA parser is tested alongside the DFA parser,
12 // so they should never be out of sync.
13 //
14 // The basic structure of the implementation is to encode the NFA parser as
15 // an explicit state machine in code. The DFA is then generated by populating
16 // a transition table on the stack by exhaustively enumerating all possible
17 // states on all possible inputs (this is possible because the number of states
18 // and the number of inputs is very small).
19 //
20 // Note that some pieces of the NFA parser (such as the NFA state machine) are
21 // required. In particular, the translation from the NFA to the DFA depends on
22 // the configuration of the CSV parser as given by the caller, and indeed, this
23 // is one of the key performance benefits of the DFA: it doesn't have any
24 // overhead (other than a bigger transition table) associated with the number
25 // of configuration options.
26 //
27 // ADVICE FOR HACKERS
28 //
29 // This code is too clever for its own good. As such, changes to some parts of
30 // the code may have a non-obvious impact on other parts. This is mostly
31 // motivated by trying to keep the DFA transition table as small as possible,
32 // since it is stored on the stack. Here are some tips that may save you some
33 // time:
34 //
35 // * If you add a new NFA state, then you also need to consider how it impacts
36 //   the DFA. If all of the incoming transitions into an NFA state are
37 //   epsilon transitions, then it probably isn't materialized in the DFA.
38 //   If the NFA state indicates that a field or a record has been parsed, then
39 //   it should be considered final. Let the comments in `NfaState` be your
40 //   guide.
41 // * If you add a new configuration knob to the parser, then you may need to
42 //   modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant
43 //   indicates the total number of discriminating bytes in the DFA. And if you
44 //   modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to
45 //   add a new class. For example, in order to add parsing support for
46 //   comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment
47 //   byte (if one exists) to the list of classes in `build_dfa`.
48 // * The special DFA start state doubles as the final state once all input
49 //   from the caller has been exhausted. We must be careful to guard this
50 //   case analysis on whether the input is actually exhausted, since the start
51 //   state is an otherwise valid state.
52 
53 /// A pull based CSV reader.
54 ///
55 /// This reader parses CSV data using a finite state machine. Callers can
56 /// extract parsed data incrementally using one of the `read` methods.
57 ///
58 /// Note that this CSV reader is somewhat encoding agnostic. The source data
59 /// needs to be at least ASCII compatible. There is no support for specifying
60 /// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
61 /// any byte can be used, although callers probably want to stick to the ASCII
62 /// subset (`<= 0x7F`).
63 ///
64 /// # Usage
65 ///
66 /// A reader has two different ways to read CSV data, each with their own
67 /// trade offs.
68 ///
69 /// * `read_field` - Copies a single CSV field into an output buffer while
70 ///   unescaping quotes. This is simple to use and doesn't require storing an
71 ///   entire record contiguously in memory, but it is slower.
72 /// * `read_record` - Copies an entire CSV record into an output buffer while
73 ///   unescaping quotes. The ending positions of each field are copied into
74 ///   an additional buffer. This is harder to use and requires larger output
75 ///   buffers, but it is faster than `read_field` since it amortizes more
76 ///   costs.
77 ///
78 /// # RFC 4180
79 ///
80 /// [RFC 4180](https://tools.ietf.org/html/rfc4180)
81 /// is the closest thing to a specification for CSV data. Unfortunately,
82 /// CSV data that is seen in the wild can vary significantly. Often, the CSV
83 /// data is outright invalid. Instead of fixing the producers of bad CSV data,
84 /// we have seen fit to make consumers much more flexible in what they accept.
85 /// This reader continues that tradition, and therefore, isn't technically
86 /// compliant with RFC 4180. In particular, this reader will never return an
87 /// error and will always find *a* parse.
88 ///
89 /// Here are some detailed differences from RFC 4180:
90 ///
91 /// * CRLF, LF and CR are each treated as a single record terminator by
92 ///   default.
93 /// * Records are permitted to be of varying length.
94 /// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// The escape byte, if escaped quotes (e.g., `\"`) should be recognized.
    ///
    /// When `None` (the default), only doubled quotes can act as escapes.
    escape: Option<u8>,
    /// Whether to recognize doubled quotes as an escaped quote.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number, as measured by occurrences of `\n`.
    line: u64,
    /// Whether this parser has ever read anything.
    ///
    /// Used to decide whether a leading UTF-8 BOM should still be stripped.
    has_read: bool,
    /// The current position in the output buffer when reading a record.
    output_pos: usize,
}
130 
131 impl Default for Reader {
default() -> Reader132     fn default() -> Reader {
133         Reader {
134             dfa: Dfa::new(),
135             dfa_state: DfaState::start(),
136             nfa_state: NfaState::StartRecord,
137             delimiter: b',',
138             term: Terminator::default(),
139             quote: b'"',
140             escape: None,
141             double_quote: true,
142             comment: None,
143             quoting: true,
144             use_nfa: false,
145             line: 1,
146             has_read: false,
147             output_pos: 0,
148         }
149     }
150 }
151 
152 /// Builds a CSV reader with various configuration knobs.
153 ///
154 /// This builder can be used to tweak the field delimiter, record terminator
155 /// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
156 /// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    /// The reader under construction; `build` clones it and compiles its DFA.
    rdr: Reader,
}
161 
162 impl ReaderBuilder {
163     /// Create a new builder.
new() -> ReaderBuilder164     pub fn new() -> ReaderBuilder {
165         ReaderBuilder::default()
166     }
167 
168     /// Build a CSV parser from this configuration.
build(&self) -> Reader169     pub fn build(&self) -> Reader {
170         let mut rdr = self.rdr.clone();
171         rdr.build_dfa();
172         rdr
173     }
174 
175     /// The field delimiter to use when parsing CSV.
176     ///
177     /// The default is `b','`.
delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder178     pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
179         self.rdr.delimiter = delimiter;
180         self
181     }
182 
183     /// The record terminator to use when parsing CSV.
184     ///
185     /// A record terminator can be any single byte. The default is a special
186     /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
187     /// or `\r\n` as a single record terminator.
terminator(&mut self, term: Terminator) -> &mut ReaderBuilder188     pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
189         self.rdr.term = term;
190         self
191     }
192 
193     /// The quote character to use when parsing CSV.
194     ///
195     /// The default is `b'"'`.
quote(&mut self, quote: u8) -> &mut ReaderBuilder196     pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
197         self.rdr.quote = quote;
198         self
199     }
200 
201     /// The escape character to use when parsing CSV.
202     ///
203     /// In some variants of CSV, quotes are escaped using a special escape
204     /// character like `\` (instead of escaping quotes by doubling them).
205     ///
206     /// By default, recognizing these idiosyncratic escapes is disabled.
escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder207     pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
208         self.rdr.escape = escape;
209         self
210     }
211 
212     /// Enable double quote escapes.
213     ///
214     /// This is enabled by default, but it may be disabled. When disabled,
215     /// doubled quotes are not interpreted as escapes.
double_quote(&mut self, yes: bool) -> &mut ReaderBuilder216     pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
217         self.rdr.double_quote = yes;
218         self
219     }
220 
221     /// Enable or disable quoting.
222     ///
223     /// This is enabled by default, but it may be disabled. When disabled,
224     /// quotes are not treated specially.
quoting(&mut self, yes: bool) -> &mut ReaderBuilder225     pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
226         self.rdr.quoting = yes;
227         self
228     }
229 
230     /// The comment character to use when parsing CSV.
231     ///
232     /// If the start of a record begins with the byte given here, then that
233     /// line is ignored by the CSV parser.
234     ///
235     /// This is disabled by default.
comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder236     pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
237         self.rdr.comment = comment;
238         self
239     }
240 
241     /// A convenience method for specifying a configuration to read ASCII
242     /// delimited text.
243     ///
244     /// This sets the delimiter and record terminator to the ASCII unit
245     /// separator (`\x1F`) and record separator (`\x1E`), respectively.
ascii(&mut self) -> &mut ReaderBuilder246     pub fn ascii(&mut self) -> &mut ReaderBuilder {
247         self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
248     }
249 
250     /// Enable or disable the NFA for parsing CSV.
251     ///
252     /// This is intended to be a debug option useful for debugging. The NFA
253     /// is always slower than the DFA.
254     #[doc(hidden)]
nfa(&mut self, yes: bool) -> &mut ReaderBuilder255     pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
256         self.rdr.use_nfa = yes;
257         self
258     }
259 }
260 
/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
284 
285 impl ReadFieldResult {
from_nfa( state: NfaState, inpdone: bool, outdone: bool, ) -> ReadFieldResult286     fn from_nfa(
287         state: NfaState,
288         inpdone: bool,
289         outdone: bool,
290     ) -> ReadFieldResult {
291         match state {
292             NfaState::End => ReadFieldResult::End,
293             NfaState::EndRecord | NfaState::CRLF => {
294                 ReadFieldResult::Field { record_end: true }
295             }
296             NfaState::EndFieldDelim => {
297                 ReadFieldResult::Field { record_end: false }
298             }
299             _ => {
300                 assert!(!state.is_field_final());
301                 if !inpdone && outdone {
302                     ReadFieldResult::OutputFull
303                 } else {
304                     ReadFieldResult::InputEmpty
305                 }
306             }
307         }
308     }
309 }
310 
/// The result of parsing at most one field from CSV data while ignoring the
/// output.
///
/// This mirrors `ReadFieldResult`, minus the `OutputFull` variant, since no
/// output buffer is written to.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
332 
/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
353 
354 impl ReadRecordResult {
is_record(&self) -> bool355     fn is_record(&self) -> bool {
356         *self == ReadRecordResult::Record
357     }
358 
from_nfa( state: NfaState, inpdone: bool, outdone: bool, endsdone: bool, ) -> ReadRecordResult359     fn from_nfa(
360         state: NfaState,
361         inpdone: bool,
362         outdone: bool,
363         endsdone: bool,
364     ) -> ReadRecordResult {
365         match state {
366             NfaState::End => ReadRecordResult::End,
367             NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
368             _ => {
369                 assert!(!state.is_record_final());
370                 if !inpdone && outdone {
371                     ReadRecordResult::OutputFull
372                 } else if !inpdone && endsdone {
373                     ReadRecordResult::OutputEndsFull
374                 } else {
375                     ReadRecordResult::InputEmpty
376                 }
377             }
378         }
379     }
380 }
381 
/// The result of parsing at most one record from CSV data while ignoring
/// output.
///
/// This mirrors `ReadRecordResult`, minus the output-buffer variants, since
/// no output buffers are written to.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
397 
/// What should be done with input bytes during an NFA transition.
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    /// Do not consume an input byte.
    Epsilon,
    /// Copy input byte to a caller-provided output buffer.
    CopyToOutput,
    /// Consume but do not copy input byte (for example, seeing a field
    /// delimiter will consume an input byte but should not copy it to the
    /// output buffer).
    Discard,
}
410 
/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as DFA
/// states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states.
    //
    // NOTE(review): the discriminant ordering appears to be significant: the
    // DFA readers compare states numerically (e.g., `state >= final_field`
    // in `read_record_dfa`) to detect final-field/final-record states.
    // Preserve this ordering when adding states — confirm against `Dfa`.
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}
440 
441 /// A list of NFA states that have an explicit representation in the DFA.
442 const NFA_STATES: &'static [NfaState] = &[
443     NfaState::StartRecord,
444     NfaState::StartField,
445     NfaState::EndFieldDelim,
446     NfaState::InField,
447     NfaState::InQuotedField,
448     NfaState::InEscapedQuote,
449     NfaState::InDoubleEscapedQuote,
450     NfaState::InComment,
451     NfaState::EndRecord,
452     NfaState::CRLF,
453 ];
454 
455 impl NfaState {
456     /// Returns true if this state indicates that a field has been parsed.
is_field_final(&self) -> bool457     fn is_field_final(&self) -> bool {
458         match *self {
459             NfaState::End
460             | NfaState::EndRecord
461             | NfaState::CRLF
462             | NfaState::EndFieldDelim => true,
463             _ => false,
464         }
465     }
466 
467     /// Returns true if this state indicates that a record has been parsed.
is_record_final(&self) -> bool468     fn is_record_final(&self) -> bool {
469         match *self {
470             NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
471             _ => false,
472         }
473     }
474 }
475 
476 impl Reader {
477     /// Create a new CSV reader with a default parser configuration.
new() -> Reader478     pub fn new() -> Reader {
479         ReaderBuilder::new().build()
480     }
481 
    /// Reset the parser such that it behaves as if it had never been used.
    ///
    /// This may be useful when reading CSV data in a random access pattern.
    pub fn reset(&mut self) {
        // Both state machines restart at the beginning of a record.
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.nfa_state = NfaState::StartRecord;
        // Line numbers are 1-based. Clearing `has_read` means a leading
        // UTF-8 BOM will be stripped again on the next read.
        self.line = 1;
        self.has_read = false;
        self.output_pos = 0;
    }
492 
    /// Return the current line number as measured by the number of occurrences
    /// of `\n`.
    ///
    /// Line numbers start at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }
500 
    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context.
    ///
    /// No validation is performed on `line`.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }
508 
509     /// Parse a single CSV field in `input` and copy field data to `output`.
510     ///
511     /// This routine requires a caller provided buffer of CSV data as the
512     /// `input` and a caller provided buffer, `output`, in which to store field
513     /// data extracted from `input`. The field data copied to `output` will
514     /// have its quotes unescaped.
515     ///
516     /// Calling this routine parses at most a single field and returns
517     /// three values indicating the state of the parser. The first value, a
518     /// `ReadFieldResult`, tells the caller what to do next. For example, if
519     /// the entire input was read or if the output buffer was filled before
520     /// a full field had been read, then `ReadFieldResult::InputEmpty` or
521     /// `ReadFieldResult::OutputFull` is returned, respectively. See the
522     /// documentation for `ReadFieldResult` for more details.
523     ///
524     /// The other two values returned correspond to the number of bytes
525     /// read from `input` and written to `output`, respectively.
526     ///
527     /// # Termination
528     ///
529     /// This reader interprets an empty `input` buffer as an indication that
530     /// there is no CSV data left to read. Namely, when the caller has
531     /// exhausted all CSV data, the caller should continue to call `read` with
532     /// an empty input buffer until `ReadFieldResult::End` is returned.
533     ///
534     /// # Errors
535     ///
536     /// This CSV reader can never return an error. Instead, it prefers *a*
537     /// parse over *no* parse.
read_field( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize)538     pub fn read_field(
539         &mut self,
540         input: &[u8],
541         output: &mut [u8],
542     ) -> (ReadFieldResult, usize, usize) {
543         let (input, bom_nin) = self.strip_utf8_bom(input);
544         let (res, nin, nout) = if self.use_nfa {
545             self.read_field_nfa(input, output)
546         } else {
547             self.read_field_dfa(input, output)
548         };
549         self.has_read = true;
550         (res, nin + bom_nin, nout)
551     }
552 
553     /// Parse a single CSV record in `input` and copy each field contiguously
554     /// to `output`, with the end position of each field written to `ends`.
555     ///
556     /// **NOTE**: This method is more cumbersome to use than `read_field`, but
557     /// it can be faster since it amortizes more work.
558     ///
559     /// This routine requires a caller provided buffer of CSV data as the
560     /// `input` and two caller provided buffers to store the unescaped field
561     /// data (`output`) and the end position of each field in the record
562     /// (`fields`).
563     ///
564     /// Calling this routine parses at most a single record and returns four
565     /// values indicating the state of the parser. The first value, a
566     /// `ReadRecordResult`, tells the caller what to do next. For example, if
567     /// the entire input was read or if the output buffer was filled before a
568     /// full field had been read, then `ReadRecordResult::InputEmpty` or
569     /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if
570     /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is
571     /// returned. See the documentation for `ReadRecordResult` for more
572     /// details.
573     ///
574     /// The other three values correspond to the number of bytes read from
575     /// `input`, the number of bytes written to `output` and the number of
576     /// end positions written to `ends`, respectively.
577     ///
578     /// The end positions written to `ends` are constructed as if there was
579     /// a single contiguous buffer in memory containing the entire row, even
580     /// if `ReadRecordResult::OutputFull` was returned in the middle of reading
581     /// a row.
582     ///
583     /// # Termination
584     ///
585     /// This reader interprets an empty `input` buffer as an indication that
586     /// there is no CSV data left to read. Namely, when the caller has
587     /// exhausted all CSV data, the caller should continue to call `read` with
588     /// an empty input buffer until `ReadRecordResult::End` is returned.
589     ///
590     /// # Errors
591     ///
592     /// This CSV reader can never return an error. Instead, it prefers *a*
593     /// parse over *no* parse.
read_record( &mut self, input: &[u8], output: &mut [u8], ends: &mut [usize], ) -> (ReadRecordResult, usize, usize, usize)594     pub fn read_record(
595         &mut self,
596         input: &[u8],
597         output: &mut [u8],
598         ends: &mut [usize],
599     ) -> (ReadRecordResult, usize, usize, usize) {
600         let (input, bom_nin) = self.strip_utf8_bom(input);
601         let (res, nin, nout, nend) = if self.use_nfa {
602             self.read_record_nfa(input, output, ends)
603         } else {
604             self.read_record_dfa(input, output, ends)
605         };
606         self.has_read = true;
607         (res, nin + bom_nin, nout, nend)
608     }
609 
610     /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that
611     /// this method will fail to strip off the BOM if only part of the BOM is
612     /// buffered. Hopefully that won't happen very often.
strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize)613     fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
614         let (input, nin) = if {
615             !self.has_read
616                 && input.len() >= 3
617                 && &input[0..3] == b"\xef\xbb\xbf"
618         } {
619             (&input[3..], 3)
620         } else {
621             (input, 0)
622         };
623         (input, nin)
624     }
625 
    /// DFA-based implementation of `read_record`.
    ///
    /// Returns the result plus the number of bytes read from `input`, bytes
    /// written to `output` and end positions written to `ends`.
    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        // An empty input signals that the caller has exhausted all CSV data,
        // so run the final state transition instead of consuming bytes.
        if input.is_empty() {
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and while
            // they'll have everything they need in `output`, they'll be
            // missing the final end position of the final field in `ends`.
            // We insert that here, but we must take care to handle the case
            // where `ends` doesn't have enough space. If it doesn't have
            // enough space, then we also can't transition to the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // Without at least one free slot in each output buffer, the loop
        // below could make no progress at all.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            // Track line numbers by counting `\n` bytes as they are consumed.
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            // States at or beyond `final_field` mark the end of a field;
            // states strictly beyond it mark the end of the record.
            if state >= self.dfa.final_field {
                ends[nend] = self.output_pos + nout;
                nend += 1;
                if state > self.dfa.final_field {
                    break;
                }
            }
            // Hot-loop optimization: while inside a (possibly quoted) field,
            // let `scan_and_copy` consume a run of input bytes at once,
            // advancing `nin`/`nout` in place.
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        // `output_pos` carries the running field offset across calls so that
        // `ends` reflects positions in one logical contiguous record buffer.
        if res.is_record() {
            self.output_pos = 0;
        } else {
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }
705 
    /// DFA-based implementation of `read_field`.
    ///
    /// Returns the result plus the number of bytes read from `input` and
    /// written to `output`.
    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        // An empty input signals that the caller has exhausted all CSV data,
        // so run the final state transition instead of consuming bytes.
        if input.is_empty() {
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        // Without at least one free output byte, no progress can be made.
        if output.is_empty() {
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            // Track line numbers by counting `\n` bytes as they are consumed.
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            // Stop as soon as a field (or record) boundary is reached.
            if state >= self.dfa.final_field {
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }
750 
    /// Perform the final state transition, i.e., when the caller indicates
    /// that the input has been exhausted.
    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
        // If we've already emitted a record or think we're ready to start
        // parsing a new record, then we should sink into the final state
        // and never move from there. (pro-tip: the start state doubles as
        // the final state!)
        if state >= self.dfa.final_record || state.is_start() {
            self.dfa.new_state_final_end()
        } else {
            // Otherwise, input ended mid-record: emit the in-progress record.
            self.dfa.new_state_final_record()
        }
    }
764 
    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication instruction
        // when computing the next transition, we store the starting index of
        // each state's row, which would not be representable in a single byte.
        // So we'd need a `u16`, which doubles our transition table size to
        // ~5KB. This is a lot to put on the stack, even though it probably
        // fits in the L1 cache of most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are otherwise
        // indistinguishable to the DFA, so it would be OK to treat them as
        // if they were equivalent. That is, they are in the same equivalence
        // class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration knobs
        // are added to the parser with more discriminating bytes, then this
        // logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when computing
        // the next transition, microbenchmarks say that it doesn't make much
        // of a difference. Perhaps because everything fits into the L1 cache.
        self.dfa.classes.add(self.delimiter);
        if self.quoting {
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            // NOTE(review): assumes no other `Terminator` variant can be
            // constructed/reached here — confirm against the `Terminator`
            // definition in the crate root.
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for all
        // possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                // Each DFA edge therefore collapses a chain of epsilon
                // transitions plus exactly one byte-consuming NFA step.
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        // Parsing always begins at the start-of-record state.
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        // Cache the marker states (in_field/in_quoted/final_*) now that the
        // table is complete.
        self.dfa.finish();
    }
848 
849     // The NFA implementation follows. The transition_final_nfa and
850     // transition_nfa methods are required for the DFA to operate. The
851     // rest are included for completeness (and debugging). Note that this
852     // NFA implementation is included in most of the CSV parser tests below.
853 
    /// NFA analogue of the DFA record reader: parse up to one record from
    /// `input`, appending field bytes to `output` and writing the end offset
    /// of each completed field into `ends`.
    ///
    /// Returns `(result, input bytes consumed, output cursor, field ends
    /// written)`.
    ///
    /// `self.output_pos` carries the output offset of the record in progress
    /// across calls; it is reset to 0 once a full record is emitted.
    /// (NOTE(review): the loop below indexes `output` starting at
    /// `self.output_pos`, which suggests a caller resuming a partial record
    /// passes the same, un-advanced output buffer — confirm against the
    /// public `read_record` contract.)
    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        // Input exhausted: run the final transition, which may flush a
        // pending (unterminated) record.
        if input.is_empty() {
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            return match res {
                ReadRecordResult::Record => {
                    // The flushed record needs one `ends` slot for its last
                    // field; without it, no progress can be made.
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // Zero-capacity buffers can never absorb progress; fail fast.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                // Epsilon: re-examine the same byte from the new state.
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                // A field just ended at output offset `nout`.
                ends[nend] = nout;
                nend += 1;
                // Continue only across an in-record field boundary; a record
                // boundary (or stream end) terminates this call.
                if state != NfaState::EndFieldDelim {
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }
920 
921     #[inline(always)]
read_field_nfa( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize)922     fn read_field_nfa(
923         &mut self,
924         input: &[u8],
925         output: &mut [u8],
926     ) -> (ReadFieldResult, usize, usize) {
927         if input.is_empty() {
928             self.nfa_state = self.transition_final_nfa(self.nfa_state);
929             let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
930             return (res, 0, 0);
931         }
932         if output.is_empty() {
933             // If the output buffer is empty, then we can never make progress,
934             // so just quit now.
935             return (ReadFieldResult::OutputFull, 0, 0);
936         }
937         let (mut nin, mut nout) = (0, 0);
938         let mut state = self.nfa_state;
939         while nin < input.len() && nout < output.len() {
940             let (s, io) = self.transition_nfa(state, input[nin]);
941             match io {
942                 NfaInputAction::CopyToOutput => {
943                     output[nout] = input[nin];
944                     nout += 1;
945                     nin += 1;
946                 }
947                 NfaInputAction::Discard => {
948                     nin += 1;
949                 }
950                 NfaInputAction::Epsilon => (),
951             }
952             state = s;
953             if state.is_field_final() {
954                 break;
955             }
956         }
957         let res = ReadFieldResult::from_nfa(
958             state,
959             nin >= input.len(),
960             nout >= output.len(),
961         );
962         self.nfa_state = state;
963         (res, nin, nout)
964     }
965 
966     /// Compute the final NFA transition after all caller-provided input has
967     /// been exhausted.
968     #[inline(always)]
transition_final_nfa(&self, state: NfaState) -> NfaState969     fn transition_final_nfa(&self, state: NfaState) -> NfaState {
970         use self::NfaState::*;
971         match state {
972             End | StartRecord | EndRecord | InComment | CRLF => End,
973             StartField | EndFieldDelim | EndFieldTerm | InField
974             | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
975             | InRecordTerm => EndRecord,
976         }
977     }
978 
    /// Compute the next NFA state given the current NFA state and the current
    /// input byte.
    ///
    /// This returns the next NFA state along with an NfaInputAction that
    /// indicates what should be done with the input byte (nothing for an epsilon
    /// transition, copied to a caller provided output buffer, or discarded).
    #[inline(always)]
    fn transition_nfa(
        &self,
        state: NfaState,
        c: u8,
    ) -> (NfaState, NfaInputAction) {
        use self::NfaState::*;
        match state {
            // Terminal sink: once ended, stay ended without consuming input.
            End => (End, NfaInputAction::Epsilon),
            StartRecord => {
                if self.term.equals(c) {
                    // Leading terminator bytes (blank lines) are skipped
                    // without producing a record.
                    (StartRecord, NfaInputAction::Discard)
                } else if self.comment == Some(c) {
                    (InComment, NfaInputAction::Discard)
                } else {
                    // Any other byte begins a field; re-examine it there.
                    (StartField, NfaInputAction::Epsilon)
                }
            }
            EndRecord => (StartRecord, NfaInputAction::Epsilon),
            StartField => {
                // Precedence matters: a quote byte is checked before the
                // delimiter and terminator.
                if self.quoting && self.quote == c {
                    (InQuotedField, NfaInputAction::Discard)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    // Leave the terminator byte for InRecordTerm to consume.
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
            InField => {
                if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InQuotedField => {
                // Inside quotes, delimiter/terminator bytes are plain data;
                // only the quote and escape bytes are special.
                if self.quoting && self.quote == c {
                    (InDoubleEscapedQuote, NfaInputAction::Discard)
                } else if self.quoting && self.escape == Some(c) {
                    (InEscapedQuote, NfaInputAction::Discard)
                } else {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                }
            }
            // The byte after an escape is copied verbatim.
            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
            InDoubleEscapedQuote => {
                // We just saw a quote inside a quoted field: a second quote
                // (when double-quoting is enabled) re-enters the quoted
                // field; a delimiter/terminator closes the field; anything
                // else degrades to an unquoted field.
                if self.quoting && self.double_quote && self.quote == c {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InComment => {
                // Comments run to the next LF; this checks b'\n' literally,
                // independent of the configured record terminator.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (InComment, NfaInputAction::Discard)
                }
            }
            InRecordTerm => {
                // Consume the terminator byte; with a CRLF terminator a CR
                // may be followed by an optional LF, handled in CRLF below.
                if self.term.is_crlf() && b'\r' == c {
                    (CRLF, NfaInputAction::Discard)
                } else {
                    (EndRecord, NfaInputAction::Discard)
                }
            }
            CRLF => {
                // After a CR: a following LF belongs to the same terminator
                // and is consumed; any other byte starts the next record and
                // is re-examined there.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (StartRecord, NfaInputAction::Epsilon)
                }
            }
        }
    }
1070 }
1071 
/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class of
/// bytes, where every byte in the same equivalence class is
/// indistinguishable from any other byte with respect to the DFA. For example,
/// if neither `a` nor `b` are specified as a delimiter/quote/terminator/escape,
/// then the DFA will never discriminate between `a` or `b`, so they can
/// effectively be treated as identical. This reduces storage space
/// substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can only
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The size of the byte-to-equivalence-class map: one entry for each of the
/// 256 possible input byte values. (This is the alphabet size, not the
/// number of transition classes — see `TRANS_CLASSES` above for that.)
const CLASS_SIZE: usize = 256;
1097 
/// A representation of a DFA.
///
/// For the most part, this is a transition table, but various optimizations
/// have been applied to reduce its memory footprint.
struct Dfa {
    /// The core transition table. Each row corresponds to the transitions for
    /// each input equivalence class. (Input bytes are mapped to their
    /// corresponding equivalence class with the `classes` map.)
    ///
    /// DFA states are represented as an index corresponding to the start of
    /// its row in this table.
    trans: [DfaState; TRANS_SIZE],
    /// A table with the same layout as `trans`, except its values indicate
    /// whether a particular `(state, equivalence class)` pair should emit an
    /// output byte.
    has_output: [bool; TRANS_SIZE],
    /// A map from input byte to equivalence class.
    ///
    /// This is responsible for reducing the effective alphabet size from
    /// 256 to `TRANS_CLASSES`.
    classes: DfaClasses,
    /// The DFA state corresponding to being inside an unquoted field.
    in_field: DfaState,
    /// The DFA state corresponding to being inside a quoted field.
    in_quoted: DfaState,
    /// The minimum DFA state that indicates a field has been parsed. All DFA
    /// states greater than this are also final-field states.
    final_field: DfaState,
    /// The minimum DFA state that indicates a record has been parsed. All DFA
    /// states greater than this are also final-record states.
    final_record: DfaState,
}
1130 
impl Dfa {
    /// Create a DFA with empty (all-zero) tables. The tables are populated
    /// afterwards via `set` and `finish` (see `build_dfa`).
    fn new() -> Dfa {
        Dfa {
            trans: [DfaState(0); TRANS_SIZE],
            has_output: [false; TRANS_SIZE],
            classes: DfaClasses::new(),
            in_field: DfaState(0),
            in_quoted: DfaState(0),
            final_field: DfaState(0),
            final_record: DfaState(0),
        }
    }

    /// Map an NFA state to its DFA state: the starting index of its row in
    /// the transition table, i.e., `nfa_state * num_classes`. The
    /// `checked_mul` guards against the row index overflowing a `u8`.
    fn new_state(&self, nfa_state: NfaState) -> DfaState {
        let nclasses = self.classes.num_classes() as u8;
        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
        DfaState(idx)
    }

    /// The sink state used once parsing has fully terminated. (The start
    /// state doubles as the end state.)
    fn new_state_final_end(&self) -> DfaState {
        self.new_state(NfaState::StartRecord)
    }

    /// The state indicating that the final input transition flushed one
    /// last record.
    fn new_state_final_record(&self) -> DfaState {
        self.new_state(NfaState::EndRecord)
    }

    /// Look up the transition for `state` on input byte `c`: the next state
    /// and whether `c` should be copied to the caller's output.
    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
        let cls = self.classes.classes[c as usize];
        // Row start + equivalence class = cell index; no multiplication
        // needed at parse time.
        let idx = state.0 as usize + cls as usize;
        (self.trans[idx], self.has_output[idx])
    }

    /// Record the transition `from --c--> to` (keyed by `c`'s equivalence
    /// class), along with whether `c` is emitted on that edge.
    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
        let cls = self.classes.classes[c as usize];
        let idx = from.0 as usize + cls as usize;
        self.trans[idx] = to;
        self.has_output[idx] = output;
    }

    /// Cache the special marker states once all transitions are in place.
    /// `final_field`/`final_record` act as thresholds (`state >= ...`),
    /// which relies on the discriminant ordering of `NfaState`.
    /// (NOTE(review): confirm against the enum definition, which is outside
    /// this view.)
    fn finish(&mut self) {
        self.in_field = self.new_state(NfaState::InField);
        self.in_quoted = self.new_state(NfaState::InQuotedField);
        self.final_field = self.new_state(NfaState::EndFieldDelim);
        self.final_record = self.new_state(NfaState::EndRecord);
    }

    /// Translate a DFA state plus buffer bookkeeping into a `read_field`
    /// result. `is_final_trans` is true only after the final (empty-input)
    /// transition, in which case landing on the start state means the
    /// stream is exhausted.
    fn new_read_field_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        if state >= self.final_record {
            ReadFieldResult::Field { record_end: true }
        } else if state == self.final_field {
            ReadFieldResult::Field { record_end: false }
        } else if is_final_trans && state.is_start() {
            ReadFieldResult::End
        } else {
            debug_assert!(state < self.final_field);
            // Output filled up before the input ran out; otherwise we simply
            // consumed all available input mid-field.
            if !inpdone && outdone {
                ReadFieldResult::OutputFull
            } else {
                ReadFieldResult::InputEmpty
            }
        }
    }

    /// Translate a DFA state plus buffer bookkeeping into a `read_record`
    /// result; same conventions as `new_read_field_result`, with a separate
    /// signal for the field-ends buffer filling up.
    fn new_read_record_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        if state >= self.final_record {
            ReadRecordResult::Record
        } else if is_final_trans && state.is_start() {
            ReadRecordResult::End
        } else {
            debug_assert!(state < self.final_record);
            if !inpdone && outdone {
                ReadRecordResult::OutputFull
            } else if !inpdone && endsdone {
                ReadRecordResult::OutputEndsFull
            } else {
                ReadRecordResult::InputEmpty
            }
        }
    }
}
1225 
/// A map from input byte to equivalence class.
struct DfaClasses {
    /// For each of the 256 possible byte values, the equivalence class it
    /// belongs to. Class 0 is the catch-all for bytes never registered
    /// via `add`.
    classes: [u8; CLASS_SIZE],
    /// The identifier that the next call to `add` will assign. Starts at 1
    /// because class 0 is reserved for "not special" bytes.
    next_class: usize,
}
1231 
1232 impl DfaClasses {
new() -> DfaClasses1233     fn new() -> DfaClasses {
1234         DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 }
1235     }
1236 
add(&mut self, b: u8)1237     fn add(&mut self, b: u8) {
1238         if self.next_class > CLASS_SIZE {
1239             panic!("added too many classes")
1240         }
1241         self.classes[b as usize] = self.next_class as u8;
1242         self.next_class = self.next_class + 1;
1243     }
1244 
num_classes(&self) -> usize1245     fn num_classes(&self) -> usize {
1246         self.next_class as usize
1247     }
1248 
1249     /// Scan and copy the input bytes to the output buffer quickly.
1250     ///
1251     /// This assumes that the current state of the DFA is either `InField` or
1252     /// `InQuotedField`. In this case, all bytes corresponding to the first
1253     /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are
1254     /// guaranteed to never result in a state transition out of the current
1255     /// state. This function takes advantage of that copies every byte from
1256     /// `input` in the first equivalence class to `output`. Once a byte is seen
1257     /// outside the first equivalence class, we quit and should fall back to
1258     /// the main DFA loop.
1259     #[inline(always)]
scan_and_copy( &self, input: &[u8], nin: &mut usize, output: &mut [u8], nout: &mut usize, )1260     fn scan_and_copy(
1261         &self,
1262         input: &[u8],
1263         nin: &mut usize,
1264         output: &mut [u8],
1265         nout: &mut usize,
1266     ) {
1267         while *nin < input.len()
1268             && *nout < output.len()
1269             && self.classes[input[*nin] as usize] == 0
1270         {
1271             output[*nout] = input[*nin];
1272             *nin += 1;
1273             *nout += 1;
1274         }
1275     }
1276 }
1277 
/// A single DFA state.
///
/// A DFA state is represented by the starting index of its corresponding row
/// in the DFA transition table. This representation allows us to elide a
/// single multiplication instruction when computing the next transition for
/// a particular input byte.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct DfaState(u8);

impl DfaState {
    /// The start state, whose row sits at the very beginning of the
    /// transition table.
    fn start() -> DfaState {
        DfaState(0)
    }

    /// Returns true if and only if this is the start state.
    fn is_start(&self) -> bool {
        matches!(self.0, 0)
    }
}
1296 
1297 impl fmt::Debug for Dfa {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1298     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1299         write!(f, "Dfa(N/A)")
1300     }
1301 }
1302 
1303 impl fmt::Debug for DfaClasses {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1304     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1305         write!(
1306             f,
1307             "DfaClasses {{ classes: N/A, next_class: {:?} }}",
1308             self.next_class
1309         )
1310     }
1311 }
1312 
1313 impl Clone for Dfa {
clone(&self) -> Dfa1314     fn clone(&self) -> Dfa {
1315         let mut dfa = Dfa::new();
1316         dfa.trans.copy_from_slice(&self.trans);
1317         dfa
1318     }
1319 }
1320 
1321 impl Clone for DfaClasses {
clone(&self) -> DfaClasses1322     fn clone(&self) -> DfaClasses {
1323         let mut x = DfaClasses::new();
1324         x.classes.copy_from_slice(&self.classes);
1325         x
1326     }
1327 }
1328 
1329 #[cfg(test)]
1330 mod tests {
1331     use core::str;
1332 
1333     use arrayvec::{ArrayString, ArrayVec};
1334 
1335     use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};
1336 
1337     type Csv = ArrayVec<[Row; 10]>;
1338     type Row = ArrayVec<[Field; 10]>;
1339     type Field = ArrayString<[u8; 10]>;
1340 
1341     // OMG I HATE BYTE STRING LITERALS SO MUCH.
b(s: &str) -> &[u8]1342     fn b(s: &str) -> &[u8] {
1343         s.as_bytes()
1344     }
1345 
    /// Build a `Csv` value (rows of fields) from nested array literals,
    /// e.g. `csv![["a", "b"], ["c"]]`.
    // The inner function exists to give `#[allow(unused_mut)]` an item to
    // attach to when the invocation expands to zero rows.
    macro_rules! csv {
        ($([$($field:expr),*]),*) => {{
            #[allow(unused_mut)]
            fn x() -> Csv {
                let mut csv = Csv::new();
                $(
                    let mut row = Row::new();
                    $(
                        row.push(Field::from($field).unwrap());
                    )*
                    csv.push(row);
                )*
                csv
            }
            x()
        }}
    }
1363 
    /// Define a test named `$name` asserting that parsing `$data` yields
    /// `$expected`, exercised four ways: the NFA and the DFA, each driven
    /// both field-by-field and record-by-record. An optional `$config`
    /// closure tweaks the `ReaderBuilder` before building.
    macro_rules! parses_to {
        ($name:ident, $data:expr, $expected:expr) => {
            parses_to!($name, $data, $expected, |builder| builder);
        };
        ($name:ident, $data:expr, $expected:expr, $config:expr) => {
            #[test]
            fn $name() {
                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by field");

                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by record");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by field");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by record");
            }
        };
    }
1403 
parse_by_field(rdr: &mut Reader, data: &str) -> Csv1404     fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv {
1405         let mut data = data.as_bytes();
1406         let mut field = [0u8; 10];
1407         let mut csv = Csv::new();
1408         let mut row = Row::new();
1409         let mut outpos = 0;
1410         loop {
1411             let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]);
1412             data = &data[nin..];
1413             outpos += nout;
1414 
1415             match res {
1416                 ReadFieldResult::InputEmpty => {
1417                     if !data.is_empty() {
1418                         panic!("missing input data")
1419                     }
1420                 }
1421                 ReadFieldResult::OutputFull => panic!("field too large"),
1422                 ReadFieldResult::Field { record_end } => {
1423                     let s = str::from_utf8(&field[..outpos]).unwrap();
1424                     row.push(Field::from(s).unwrap());
1425                     outpos = 0;
1426                     if record_end {
1427                         csv.push(row);
1428                         row = Row::new();
1429                     }
1430                 }
1431                 ReadFieldResult::End => {
1432                     return csv;
1433                 }
1434             }
1435         }
1436     }
1437 
parse_by_record(rdr: &mut Reader, data: &str) -> Csv1438     fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv {
1439         use crate::ReadRecordResult::*;
1440 
1441         let mut data = data.as_bytes();
1442         let mut record = [0; 1024];
1443         let mut ends = [0; 10];
1444 
1445         let mut csv = Csv::new();
1446         let (mut outpos, mut endpos) = (0, 0);
1447         loop {
1448             let (res, nin, nout, nend) = rdr.read_record(
1449                 data,
1450                 &mut record[outpos..],
1451                 &mut ends[endpos..],
1452             );
1453             data = &data[nin..];
1454             outpos += nout;
1455             endpos += nend;
1456 
1457             match res {
1458                 InputEmpty => {
1459                     if !data.is_empty() {
1460                         panic!("missing input data")
1461                     }
1462                 }
1463                 OutputFull => panic!("record too large (out buffer)"),
1464                 OutputEndsFull => panic!("record too large (end buffer)"),
1465                 Record => {
1466                     let s = str::from_utf8(&record[..outpos]).unwrap();
1467                     let mut start = 0;
1468                     let mut row = Row::new();
1469                     for &end in &ends[..endpos] {
1470                         row.push(Field::from(&s[start..end]).unwrap());
1471                         start = end;
1472                     }
1473                     csv.push(row);
1474                     outpos = 0;
1475                     endpos = 0;
1476                 }
1477                 End => return csv,
1478             }
1479         }
1480     }
1481 
    // Single-record inputs. Each trio (one field, many fields, trailing
    // comma producing a final empty field) is repeated for every
    // terminator style: none, LF, CRLF and bare CR.
    parses_to!(one_row_one_field, "a", csv![["a"]]);
    parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
    parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);

    // Multi-record inputs over the same terminator styles, with and
    // without a terminator on the final record.
    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma,
        "a,b,\nx,y,",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_lf,
        "a,b,c\nx,y,z\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_lf,
        "a,b,\nx,y,\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_crlf,
        "a,b,c\r\nx,y,z\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_crlf,
        "a,b,\r\nx,y,\r\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_cr,
        "a,b,c\rx,y,z\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_cr,
        "a,b,\rx,y,\r",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
1539 
    // Empty lines surrounding real records produce no records of their
    // own, for each terminator style.
    parses_to!(
        trailing_lines_no_record,
        "\n\n\na,b,c\nx,y,z\n\n\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_cr,
        "\r\r\ra,b,c\rx,y,z\r\r\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_crlf,
        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );

    // Empty input, or input consisting only of empty lines, parses to no
    // records at all; empty lines interspersed between records are
    // skipped. Exercised for LF, CRLF, bare CR and a mix of LF/CRLF.
    parses_to!(empty, "", csv![]);
    parses_to!(empty_lines, "\n\n\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed,
        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
    parses_to!(
        empty_lines_interspersed_crlf,
        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed_mixed,
        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
    parses_to!(
        empty_lines_interspersed_cr,
        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
1581 
    // A custom single-byte record terminator.
    parses_to!(
        term_weird,
        "zza,bzc,dzz",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.terminator(Terminator::Any(b'z'));
        }
    );

    // ASCII-delimited mode: \x1f separates fields and \x1e separates
    // records.
    parses_to!(
        ascii_delimited,
        "a\x1fb\x1ec\x1fd",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.ascii();
        }
    );

    // A UTF-8 BOM is dropped only at the very start of the input; anywhere
    // else it is ordinary field data.
    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);

    // Basic quoting: quotes delimiting a field are stripped. Note that
    // quotes appearing after leading spaces are taken literally
    // (quote_outer_space).
    parses_to!(quote_empty, "\"\"", csv![[""]]);
    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
    parses_to!(quote_space, "\" \"", csv![[" "]]);
    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
    parses_to!(quote_outer_space, "  \"a\"  ", csv![["  \"a\"  "]]);

    // The quote byte itself is configurable.
    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
        b.quote(b'z');
    });

    // This one is pretty hokey.
    // I don't really know what the "right" behavior is.
    parses_to!(
        quote_delimiter,
        ",a,,b",
        csv![["a,b"]],
        |b: &mut ReaderBuilder| {
            b.quote(b',');
        }
    );

    // Escape handling: with no escape byte configured, a backslash is
    // literal; doubled quotes escape a quote by default unless
    // double_quote(false) is set; the escape byte is configurable.
    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
    parses_to!(
        quote_escapes_no_double,
        r#""a""b""#,
        csv![[r#"a"b""#]],
        |b: &mut ReaderBuilder| {
            b.double_quote(false);
        }
    );
    parses_to!(
        quote_escapes,
        r#""a\"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\'));
        }
    );
    parses_to!(
        quote_escapes_change,
        r#""az"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'z'));
        }
    );

    // Escaped quotes keep a delimiter inside the field intact.
    parses_to!(
        quote_escapes_with_comma,
        r#""\"A,B\"""#,
        csv![[r#""A,B""#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\')).double_quote(false);
        }
    );
1659 
    // With quoting disabled, quote bytes are plain data and the delimiter
    // splits the input regardless.
    parses_to!(
        quoting_disabled,
        r#""abc,foo""#,
        csv![[r#""abc"#, r#"foo""#]],
        |b: &mut ReaderBuilder| {
            b.quoting(false);
        }
    );

    // The field delimiter is configurable.
    parses_to!(
        delimiter_tabs,
        "a\tb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'\t');
        }
    );
    parses_to!(
        delimiter_weird,
        "azb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'z');
        }
    );

    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);

    // Comment handling. Per the expected outputs: a line is only a
    // comment when the comment byte is its first byte (comment_1 vs
    // comment_2), and a comment byte mid-record is plain data
    // (comment_4, comment_5). comment_3 sets the comment byte to the
    // terminator itself, so "#" lines are not treated as comments.
    parses_to!(
        comment_1,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_2,
        "foo\n # hi\nbar\n",
        csv![["foo"], [" # hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_3,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["# hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'\n'));
        }
    );
    parses_to!(
        comment_4,
        "foo,b#ar,baz",
        csv![["foo", "b#ar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_5,
        "foo,#bar,baz",
        csv![["foo", "#bar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
1729 
    // Invokes `read_field` on `$rdr` with the given input/output buffers
    // and asserts, in order: the number of input bytes consumed, the
    // number of output bytes written, and the `ReadFieldResult` returned.
    macro_rules! assert_read {
        (
            $rdr:expr, $input:expr, $output:expr,
            $expect_in:expr, $expect_out:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout) = $rdr.read_field($input, $output);
            assert_eq!($expect_in, nin);
            assert_eq!($expect_out, nout);
            assert_eq!($expect_res, res);
        }};
    }
1741 
1742     // This tests that feeding a new reader with an empty buffer sends us
1743     // straight to End.
1744     #[test]
stream_empty()1745     fn stream_empty() {
1746         use crate::ReadFieldResult::*;
1747 
1748         let mut rdr = Reader::new();
1749         assert_read!(rdr, &[], &mut [], 0, 0, End);
1750     }
1751 
1752     // Test that a single space is treated as a single field.
1753     #[test]
stream_space()1754     fn stream_space() {
1755         use crate::ReadFieldResult::*;
1756 
1757         let mut rdr = Reader::new();
1758         assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
1759         assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
1760         assert_read!(rdr, &[], &mut [0], 0, 0, End);
1761     }
1762 
1763     // Test that a single comma ...
1764     #[test]
stream_comma()1765     fn stream_comma() {
1766         use crate::ReadFieldResult::*;
1767 
1768         let mut rdr = Reader::new();
1769         assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
1770         assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
1771         assert_read!(rdr, &[], &mut [0], 0, 0, End);
1772     }
1773 
    // Test that we can read a single large field in multiple output
    // buffers.
    #[test]
    fn stream_output_chunks() {
        use crate::ReadFieldResult::*;

        let mut inp = b("fooquux");
        // Deliberately tiny output buffer to force OutputFull.
        let out = &mut [0; 2];
        let mut rdr = Reader::new();

        // Each call fills the 2-byte output buffer and reports OutputFull.
        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("fo"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("oq"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("uu"));
        inp = &inp[2..];

        // The last byte fits, so we get InputEmpty instead of OutputFull.
        assert_read!(rdr, inp, out, 1, 1, InputEmpty);
        assert_eq!(&out[..1], b("x"));
        inp = &inp[1..];
        assert!(inp.is_empty());

        // Empty input flushes the completed field, then the stream ends.
        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, inp, out, 0, 0, End);
    }
1804 
    // Test that we can read a single large field across multiple input
    // buffers.
    #[test]
    fn stream_input_chunks() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // The field arrives two bytes at a time; each chunk is consumed in
        // full and appended to a fresh slice of the output buffer.
        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fooq"));

        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
        assert_eq!(&out[..6], b("fooquu"));

        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
        assert_eq!(&out[..7], b("fooquux"));

        // Empty input flushes the completed field, then the stream ends.
        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }
1829 
1830     // Test we can read doubled quotes correctly in a stream.
1831     #[test]
stream_doubled_quotes()1832     fn stream_doubled_quotes() {
1833         use crate::ReadFieldResult::*;
1834 
1835         let out = &mut [0; 10];
1836         let mut rdr = Reader::new();
1837 
1838         assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
1839         assert_eq!(&out[..2], b("fo"));
1840 
1841         assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
1842         assert_eq!(&out[..4], b("fo\"o"));
1843 
1844         assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1845         assert_read!(rdr, &[], out, 0, 0, End);
1846     }
1847 
1848     // Test we can read escaped quotes correctly in a stream.
1849     #[test]
stream_escaped_quotes()1850     fn stream_escaped_quotes() {
1851         use crate::ReadFieldResult::*;
1852 
1853         let out = &mut [0; 10];
1854         let mut builder = ReaderBuilder::new();
1855         let mut rdr = builder.escape(Some(b'\\')).build();
1856 
1857         assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
1858         assert_eq!(&out[..2], b("fo"));
1859 
1860         assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
1861         assert_eq!(&out[..4], b("fo\"o"));
1862 
1863         assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1864         assert_read!(rdr, &[], out, 0, 0, End);
1865     }
1866 
    // Test that empty output buffers don't wreak havoc.
    #[test]
    fn stream_empty_output() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // Read the first field normally.
        assert_read!(
            rdr,
            b("foo,bar"),
            out,
            4,
            3,
            Field { record_end: false }
        );
        assert_eq!(&out[..3], b("foo"));

        // A zero-length output buffer makes no progress at all: the reader
        // reports OutputFull without consuming any input.
        assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);

        // Retrying with a real buffer picks up where we left off.
        assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
        assert_eq!(&out[..3], b("bar"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }
1893 
    // Test that we can reset the parser mid-stream and count on it to do
    // the right thing.
    #[test]
    fn reset_works() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // Start a quoted field but stop before the closing quote arrives.
        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
        assert_eq!(&out[..3], b("foo"));

        // Without resetting the parser state, the reader will remember that
        // we're in a quoted field, and therefore interpret the leading double
        // quotes below as a single quote and the trailing quote as a matching
        // terminator. With the reset, however, the parser forgets the quoted
        // field and treats the leading double quotes as a syntax quirk and
        // drops them, in addition to hanging on to the trailing unmatched
        // quote. (Matches Python's behavior.)
        rdr.reset();

        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
        assert_eq!(&out[..4], b("bar\""));
    }
1918 
    // Test the line number reporting is correct.
    #[test]
    fn line_numbers() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // A fresh reader starts on line 1.
        assert_eq!(1, rdr.line());

        // Four terminators advance the count to line 5.
        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
        assert_eq!(5, rdr.line());

        // A field within a record does not move the line number...
        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
        assert_eq!(5, rdr.line());

        // ...but ending the record does.
        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
        assert_eq!(6, rdr.line());

        // Reaching End leaves the line number untouched.
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
        assert_eq!(6, rdr.line());
    }
1941 
    // Invokes `read_record` on `$rdr` and asserts the `ReadRecordResult`
    // along with the counts of input bytes consumed, output bytes written
    // and field ends recorded. Each assertion is labeled so a failure
    // identifies which count was wrong.
    macro_rules! assert_read_record {
        (
            $rdr:expr, $input:expr, $output:expr, $ends:expr,
            $expect_in:expr, $expect_out:expr,
            $expect_end:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout, nend) =
                $rdr.read_record($input, $output, $ends);
            assert_eq!($expect_res, res, "result");
            assert_eq!($expect_in, nin, "input");
            assert_eq!($expect_out, nout, "output");
            assert_eq!($expect_end, nend, "ends");
        }};
    }
1956 
    // Test that we can incrementally read a record.
    #[test]
    fn stream_record() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        // The first record is complete: 8 input bytes ("foo,bar\n")
        // consumed, 6 output bytes ("foobar") written, 2 field ends.
        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        // "baz" has no terminator yet, so the record is still in progress.
        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        // Empty input finishes the dangling record (one field end).
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }
1980 
    // Test that if our output ends are full during the last read that
    // we get an appropriate state returned.
    #[test]
    fn stream_record_last_end_output_full() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        // Read the first record completely.
        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        // "baz" is still an unterminated record.
        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        // With no room for field ends, finishing the record reports
        // OutputEndsFull and makes no progress; retrying with a real ends
        // buffer then completes the record.
        assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }
2006 
2007     #[test]
reset_input_partial()2008     fn reset_input_partial() {
2009         use crate::ReadRecordResult::*;
2010 
2011         let inp = b("foo,bar\nbaz");
2012         let out = &mut [0; 1024];
2013         let ends = &mut [0; 10];
2014         let mut rdr = Reader::new();
2015 
2016         assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
2017 
2018         // Try to read incomplete record.
2019         let (result, _, _, _) = rdr.read_record(&inp[8..], out, ends);
2020         assert_eq!(result, InputEmpty);
2021 
2022         rdr.reset();
2023 
2024         let inp = b("baz,raz\n");
2025         let (result, _, _, _) = rdr.read_record(inp, out, ends);
2026         assert_eq!(result, Record);
2027         assert_eq!(ends[0], 3);
2028     }
2029 }
2030