use core::fmt; use crate::Terminator; // BE ADVISED // // This may just be one of the more complicated CSV parsers you'll come across. // The implementation never allocates and consists of both a functional NFA // parser and a DFA parser. The DFA parser is the work horse and we could elide // much of the work involved in making the NFA parser work, but the NFA parser // is much easier to debug. The NFA parser is tested alongside the DFA parser, // so they should never be out of sync. // // The basic structure of the implementation is to encode the NFA parser as // an explicit state machine in code. The DFA is then generated by populating // a transition table on the stack by exhaustively enumerating all possible // states on all possible inputs (this is possible because the number of states // and the number of inputs is very small). // // Note that some pieces of the NFA parser (such as the NFA state machine) are // required. In particular, the translation from the NFA to the DFA depends on // the configuration of the CSV parser as given by the caller, and indeed, this // is one of the key performance benefits of the DFA: it doesn't have any // overhead (other than a bigger transition table) associated with the number // of configuration options. // // ADVICE FOR HACKERS // // This code is too clever for its own good. As such, changes to some parts of // the code may have a non-obvious impact on other parts. This is mostly // motivated by trying to keep the DFA transition table as small as possible, // since it is stored on the stack. Here are some tips that may save you some // time: // // * If you add a new NFA state, then you also need to consider how it impacts // the DFA. If all of the incoming transitions into an NFA state are // epsilon transitions, then it probably isn't materialized in the DFA. // If the NFA state indicates that a field or a record has been parsed, then // it should be considered final. Let the comments in `NfaState` be your // guide. // * If you add a new configuration knob to the parser, then you may need to // modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant // indicates the total number of discriminating bytes in the DFA. And if you // modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to // add a new class. For example, in order to add parsing support for // comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment // byte (if one exists) to the list of classes in `build_dfa`. // * The special DFA start state doubles as the final state once all input // from the caller has been exhausted. We must be careful to guard this // case analysis on whether the input is actually exhausted, since the start // state is an otherwise valid state. /// A pull based CSV reader. /// /// This reader parses CSV data using a finite state machine. Callers can /// extract parsed data incrementally using one of the `read` methods. /// /// Note that this CSV reader is somewhat encoding agnostic. The source data /// needs to be at least ASCII compatible. There is no support for specifying /// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead, /// any byte can be used, although callers probably want to stick to the ASCII /// subset (`<= 0x7F`). /// /// # Usage /// /// A reader has two different ways to read CSV data, each with their own /// trade offs. /// /// * `read_field` - Copies a single CSV field into an output buffer while /// unescaping quotes. This is simple to use and doesn't require storing an /// entire record contiguously in memory, but it is slower. /// * `read_record` - Copies an entire CSV record into an output buffer while /// unescaping quotes. The ending positions of each field are copied into /// an additional buffer. This is harder to use and requires larger output /// buffers, but it is faster than `read_field` since it amortizes more /// costs. /// /// # RFC 4180 /// /// [RFC 4180](https://tools.ietf.org/html/rfc4180) /// is the closest thing to a specification for CSV data. Unfortunately, /// CSV data that is seen in the wild can vary significantly. Often, the CSV /// data is outright invalid. Instead of fixing the producers of bad CSV data, /// we have seen fit to make consumers much more flexible in what they accept. /// This reader continues that tradition, and therefore, isn't technically /// compliant with RFC 4180. In particular, this reader will never return an /// error and will always find *a* parse. /// /// Here are some detailed differences from RFC 4180: /// /// * CRLF, LF and CR are each treated as a single record terminator by /// default. /// * Records are permitted to be of varying length. /// * Empty lines (that do not include other whitespace) are ignored. #[derive(Clone, Debug)] pub struct Reader { /// A table-based DFA for parsing CSV. dfa: Dfa, /// The current DFA state, if the DFA is used. dfa_state: DfaState, /// The current NFA state, if the NFA is used. nfa_state: NfaState, /// The delimiter that separates fields. delimiter: u8, /// The terminator that separates records. term: Terminator, /// The quotation byte. quote: u8, /// Whether to recognize escaped quotes. escape: Option, /// Whether to recognized doubled quotes. double_quote: bool, /// If enabled, lines beginning with this byte are ignored. comment: Option, /// If enabled (the default), then quotes are respected. When disabled, /// quotes are not treated specially. quoting: bool, /// Whether to use the NFA for parsing. /// /// Generally this is for debugging. There's otherwise no good reason /// to avoid the DFA. use_nfa: bool, /// The current line number. line: u64, /// Whether this parser has ever read anything. has_read: bool, /// The current position in the output buffer when reading a record. output_pos: usize, } impl Default for Reader { fn default() -> Reader { Reader { dfa: Dfa::new(), dfa_state: DfaState::start(), nfa_state: NfaState::StartRecord, delimiter: b',', term: Terminator::default(), quote: b'"', escape: None, double_quote: true, comment: None, quoting: true, use_nfa: false, line: 1, has_read: false, output_pos: 0, } } } /// Builds a CSV reader with various configuration knobs. /// /// This builder can be used to tweak the field delimiter, record terminator /// and more for parsing CSV. Once a CSV `Reader` is built, its configuration /// cannot be changed. #[derive(Debug, Default)] pub struct ReaderBuilder { rdr: Reader, } impl ReaderBuilder { /// Create a new builder. pub fn new() -> ReaderBuilder { ReaderBuilder::default() } /// Build a CSV parser from this configuration. pub fn build(&self) -> Reader { let mut rdr = self.rdr.clone(); rdr.build_dfa(); rdr } /// The field delimiter to use when parsing CSV. /// /// The default is `b','`. pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder { self.rdr.delimiter = delimiter; self } /// The record terminator to use when parsing CSV. /// /// A record terminator can be any single byte. The default is a special /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n` /// or `\r\n` as a single record terminator. pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder { self.rdr.term = term; self } /// The quote character to use when parsing CSV. /// /// The default is `b'"'`. pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder { self.rdr.quote = quote; self } /// The escape character to use when parsing CSV. /// /// In some variants of CSV, quotes are escaped using a special escape /// character like `\` (instead of escaping quotes by doubling them). /// /// By default, recognizing these idiosyncratic escapes is disabled. pub fn escape(&mut self, escape: Option) -> &mut ReaderBuilder { self.rdr.escape = escape; self } /// Enable double quote escapes. /// /// This is enabled by default, but it may be disabled. When disabled, /// doubled quotes are not interpreted as escapes. pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder { self.rdr.double_quote = yes; self } /// Enable or disable quoting. /// /// This is enabled by default, but it may be disabled. When disabled, /// quotes are not treated specially. pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder { self.rdr.quoting = yes; self } /// The comment character to use when parsing CSV. /// /// If the start of a record begins with the byte given here, then that /// line is ignored by the CSV parser. /// /// This is disabled by default. pub fn comment(&mut self, comment: Option) -> &mut ReaderBuilder { self.rdr.comment = comment; self } /// A convenience method for specifying a configuration to read ASCII /// delimited text. /// /// This sets the delimiter and record terminator to the ASCII unit /// separator (`\x1F`) and record separator (`\x1E`), respectively. pub fn ascii(&mut self) -> &mut ReaderBuilder { self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E')) } /// Enable or disable the NFA for parsing CSV. /// /// This is intended to be a debug option useful for debugging. The NFA /// is always slower than the DFA. #[doc(hidden)] pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder { self.rdr.use_nfa = yes; self } } /// The result of parsing at most one field from CSV data. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ReadFieldResult { /// The caller provided input was exhausted before the end of a field or /// record was found. InputEmpty, /// The caller provided output buffer was filled before an entire field /// could be written to it. OutputFull, /// The end of a field was found. /// /// Note that when `record_end` is true, then the end of this field also /// corresponds to the end of a record. Field { /// Whether this was the last field in a record or not. record_end: bool, }, /// All CSV data has been read. /// /// This state can only be returned when an empty input buffer is provided /// by the caller. End, } impl ReadFieldResult { fn from_nfa( state: NfaState, inpdone: bool, outdone: bool, ) -> ReadFieldResult { match state { NfaState::End => ReadFieldResult::End, NfaState::EndRecord | NfaState::CRLF => { ReadFieldResult::Field { record_end: true } } NfaState::EndFieldDelim => { ReadFieldResult::Field { record_end: false } } _ => { assert!(!state.is_field_final()); if !inpdone && outdone { ReadFieldResult::OutputFull } else { ReadFieldResult::InputEmpty } } } } } /// The result of parsing at most one field from CSV data while ignoring the /// output. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ReadFieldNoCopyResult { /// The caller provided input was exhausted before the end of a field or /// record was found. InputEmpty, /// The end of a field was found. /// /// Note that when `record_end` is true, then the end of this field also /// corresponds to the end of a record. Field { /// Whether this was the last field in a record or not. record_end: bool, }, /// All CSV data has been read. /// /// This state can only be returned when an empty input buffer is provided /// by the caller. End, } /// The result of parsing at most one record from CSV data. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ReadRecordResult { /// The caller provided input was exhausted before the end of a record was /// found. InputEmpty, /// The caller provided output buffer was filled before an entire field /// could be written to it. OutputFull, /// The caller provided output buffer of field end poisitions was filled /// before the next field could be parsed. OutputEndsFull, /// The end of a record was found. Record, /// All CSV data has been read. /// /// This state can only be returned when an empty input buffer is provided /// by the caller. End, } impl ReadRecordResult { fn is_record(&self) -> bool { *self == ReadRecordResult::Record } fn from_nfa( state: NfaState, inpdone: bool, outdone: bool, endsdone: bool, ) -> ReadRecordResult { match state { NfaState::End => ReadRecordResult::End, NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record, _ => { assert!(!state.is_record_final()); if !inpdone && outdone { ReadRecordResult::OutputFull } else if !inpdone && endsdone { ReadRecordResult::OutputEndsFull } else { ReadRecordResult::InputEmpty } } } } } /// The result of parsing at most one record from CSV data while ignoring /// output. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ReadRecordNoCopyResult { /// The caller provided input was exhausted before the end of a record was /// found. InputEmpty, /// The end of a record was found. Record, /// All CSV data has been read. /// /// This state can only be returned when an empty input buffer is provided /// by the caller. End, } /// What should be done with input bytes during an NFA transition #[derive(Clone, Debug, Eq, PartialEq)] enum NfaInputAction { // Do not consume an input byte Epsilon, // Copy input byte to a caller-provided output buffer CopyToOutput, // Consume but do not copy input byte (for example, seeing a field // delimiter will consume an input byte but should not copy it to the // output buffer. Discard, } /// An NFA state is a state that can be visited in the NFA parser. /// /// Given the simplicity of the machine, a subset of NFA states double as DFA /// states. NFA states that only have incoming epsilon transitions are /// optimized out when converting the machine to a DFA. #[derive(Copy, Clone, Debug, Eq, PartialEq)] enum NfaState { // These states aren't used in the DFA, so we // assign them meaningless numbers. EndFieldTerm = 200, InRecordTerm = 201, End = 202, // All states below are DFA states. StartRecord = 0, StartField = 1, InField = 2, InQuotedField = 3, InEscapedQuote = 4, InDoubleEscapedQuote = 5, InComment = 6, // All states below are "final field" states. // Namely, they indicate that a field has been parsed. EndFieldDelim = 7, // All states below are "final record" states. // Namely, they indicate that a record has been parsed. EndRecord = 8, CRLF = 9, } /// A list of NFA states that have an explicit representation in the DFA. const NFA_STATES: &'static [NfaState] = &[ NfaState::StartRecord, NfaState::StartField, NfaState::EndFieldDelim, NfaState::InField, NfaState::InQuotedField, NfaState::InEscapedQuote, NfaState::InDoubleEscapedQuote, NfaState::InComment, NfaState::EndRecord, NfaState::CRLF, ]; impl NfaState { /// Returns true if this state indicates that a field has been parsed. fn is_field_final(&self) -> bool { match *self { NfaState::End | NfaState::EndRecord | NfaState::CRLF | NfaState::EndFieldDelim => true, _ => false, } } /// Returns true if this state indicates that a record has been parsed. fn is_record_final(&self) -> bool { match *self { NfaState::End | NfaState::EndRecord | NfaState::CRLF => true, _ => false, } } } impl Reader { /// Create a new CSV reader with a default parser configuration. pub fn new() -> Reader { ReaderBuilder::new().build() } /// Reset the parser such that it behaves as if it had never been used. /// /// This may be useful when reading CSV data in a random access pattern. pub fn reset(&mut self) { self.dfa_state = self.dfa.new_state(NfaState::StartRecord); self.nfa_state = NfaState::StartRecord; self.line = 1; self.has_read = false; self.output_pos = 0; } /// Return the current line number as measured by the number of occurrences /// of `\n`. /// /// Line numbers starts at `1` and are reset when `reset` is called. pub fn line(&self) -> u64 { self.line } /// Set the line number. /// /// This is useful after a call to `reset` where the caller knows the /// line number from some additional context. pub fn set_line(&mut self, line: u64) { self.line = line; } /// Parse a single CSV field in `input` and copy field data to `output`. /// /// This routine requires a caller provided buffer of CSV data as the /// `input` and a caller provided buffer, `output`, in which to store field /// data extracted from `input`. The field data copied to `output` will /// have its quotes unescaped. /// /// Calling this routine parses at most a single field and returns /// three values indicating the state of the parser. The first value, a /// `ReadFieldResult`, tells the caller what to do next. For example, if /// the entire input was read or if the output buffer was filled before /// a full field had been read, then `ReadFieldResult::InputEmpty` or /// `ReadFieldResult::OutputFull` is returned, respectively. See the /// documentation for `ReadFieldResult` for more details. /// /// The other two values returned correspond to the number of bytes /// read from `input` and written to `output`, respectively. /// /// # Termination /// /// This reader interprets an empty `input` buffer as an indication that /// there is no CSV data left to read. Namely, when the caller has /// exhausted all CSV data, the caller should continue to call `read` with /// an empty input buffer until `ReadFieldResult::End` is returned. /// /// # Errors /// /// This CSV reader can never return an error. Instead, it prefers *a* /// parse over *no* parse. pub fn read_field( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize) { let (input, bom_nin) = self.strip_utf8_bom(input); let (res, nin, nout) = if self.use_nfa { self.read_field_nfa(input, output) } else { self.read_field_dfa(input, output) }; self.has_read = true; (res, nin + bom_nin, nout) } /// Parse a single CSV record in `input` and copy each field contiguously /// to `output`, with the end position of each field written to `ends`. /// /// **NOTE**: This method is more cumbersome to use than `read_field`, but /// it can be faster since it amortizes more work. /// /// This routine requires a caller provided buffer of CSV data as the /// `input` and two caller provided buffers to store the unescaped field /// data (`output`) and the end position of each field in the record /// (`fields`). /// /// Calling this routine parses at most a single record and returns four /// values indicating the state of the parser. The first value, a /// `ReadRecordResult`, tells the caller what to do next. For example, if /// the entire input was read or if the output buffer was filled before a /// full field had been read, then `ReadRecordResult::InputEmpty` or /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is /// returned. See the documentation for `ReadRecordResult` for more /// details. /// /// The other three values correspond to the number of bytes read from /// `input`, the number of bytes written to `output` and the number of /// end positions written to `ends`, respectively. /// /// The end positions written to `ends` are constructed as if there was /// a single contiguous buffer in memory containing the entire row, even /// if `ReadRecordResult::OutputFull` was returned in the middle of reading /// a row. /// /// # Termination /// /// This reader interprets an empty `input` buffer as an indication that /// there is no CSV data left to read. Namely, when the caller has /// exhausted all CSV data, the caller should continue to call `read` with /// an empty input buffer until `ReadRecordResult::End` is returned. /// /// # Errors /// /// This CSV reader can never return an error. Instead, it prefers *a* /// parse over *no* parse. pub fn read_record( &mut self, input: &[u8], output: &mut [u8], ends: &mut [usize], ) -> (ReadRecordResult, usize, usize, usize) { let (input, bom_nin) = self.strip_utf8_bom(input); let (res, nin, nout, nend) = if self.use_nfa { self.read_record_nfa(input, output, ends) } else { self.read_record_dfa(input, output, ends) }; self.has_read = true; (res, nin + bom_nin, nout, nend) } /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that /// this method will fail to strip off the BOM if only part of the BOM is /// buffered. Hopefully that won't happen very often. fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) { let (input, nin) = if { !self.has_read && input.len() >= 3 && &input[0..3] == b"\xef\xbb\xbf" } { (&input[3..], 3) } else { (input, 0) }; (input, nin) } #[inline(always)] fn read_record_dfa( &mut self, input: &[u8], output: &mut [u8], ends: &mut [usize], ) -> (ReadRecordResult, usize, usize, usize) { if input.is_empty() { let s = self.transition_final_dfa(self.dfa_state); let res = self.dfa.new_read_record_result(s, true, false, false, false); // This part is a little tricky. When reading the final record, // the last result the caller will get is an InputEmpty, and while // they'll have everything they need in `output`, they'll be // missing the final end position of the final field in `ends`. // We insert that here, but we must take care to handle the case // where `ends` doesn't have enough space. If it doesn't have // enough space, then we also can't transition to the next state. return match res { ReadRecordResult::Record => { if ends.is_empty() { return (ReadRecordResult::OutputEndsFull, 0, 0, 0); } self.dfa_state = s; ends[0] = self.output_pos; self.output_pos = 0; (res, 0, 0, 1) } _ => { self.dfa_state = s; (res, 0, 0, 0) } }; } if output.is_empty() { return (ReadRecordResult::OutputFull, 0, 0, 0); } if ends.is_empty() { return (ReadRecordResult::OutputEndsFull, 0, 0, 0); } let (mut nin, mut nout, mut nend) = (0, 0, 0); let mut state = self.dfa_state; while nin < input.len() && nout < output.len() && nend < ends.len() { let (s, has_out) = self.dfa.get_output(state, input[nin]); self.line += (input[nin] == b'\n') as u64; state = s; if has_out { output[nout] = input[nin]; nout += 1; } nin += 1; if state >= self.dfa.final_field { ends[nend] = self.output_pos + nout; nend += 1; if state > self.dfa.final_field { break; } } if state == self.dfa.in_field || state == self.dfa.in_quoted { self.dfa .classes .scan_and_copy(input, &mut nin, output, &mut nout); } } let res = self.dfa.new_read_record_result( state, false, nin >= input.len(), nout >= output.len(), nend >= ends.len(), ); self.dfa_state = state; if res.is_record() { self.output_pos = 0; } else { self.output_pos += nout; } (res, nin, nout, nend) } #[inline(always)] fn read_field_dfa( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize) { if input.is_empty() { self.dfa_state = self.transition_final_dfa(self.dfa_state); let res = self.dfa.new_read_field_result( self.dfa_state, true, false, false, ); return (res, 0, 0); } if output.is_empty() { return (ReadFieldResult::OutputFull, 0, 0); } let (mut nin, mut nout) = (0, 0); let mut state = self.dfa_state; while nin < input.len() && nout < output.len() { let b = input[nin]; self.line += (b == b'\n') as u64; let (s, has_out) = self.dfa.get_output(state, b); state = s; if has_out { output[nout] = b; nout += 1; } nin += 1; if state >= self.dfa.final_field { break; } } let res = self.dfa.new_read_field_result( state, false, nin >= input.len(), nout >= output.len(), ); self.dfa_state = state; (res, nin, nout) } /// Perform the final state transition, i.e., when the caller indicates /// that the input has been exhausted. fn transition_final_dfa(&self, state: DfaState) -> DfaState { // If we''ve already emitted a record or think we're ready to start // parsing a new record, then we should sink into the final state // and never move from there. (pro-tip: the start state doubles as // the final state!) if state >= self.dfa.final_record || state.is_start() { self.dfa.new_state_final_end() } else { self.dfa.new_state_final_record() } } /// Write the transition tables for the DFA based on this parser's /// configuration. fn build_dfa(&mut self) { // A naive DFA transition table has // `cells = (# number of states) * (# size of alphabet)`. While we // could get away with that, the table would have `10 * 256 = 2560` // entries. Even worse, in order to avoid a multiplication instruction // when computing the next transition, we store the starting index of // each state's row, which would not be representible in a single byte. // So we'd need a `u16`, which doubles our transition table size to // ~5KB. This is a lot to put on the stack, even though it probably // fits in the L1 cache of most modern CPUs. // // To avoid this, we note that while our "true" alphabet // has 256 distinct possibilities, the DFA itself is only // discriminatory on a very small subset of that alphabet. For // example, assuming neither `a` nor `b` are set as special // quote/comment/escape/delimiter/terminator bytes, they are otherwise // indistinguishable to the DFA, so it would be OK to treat them as // if they were equivalent. That is, they are in the same equivalence // class. // // As it turns out, using this logic, we can shrink our effective // alphabet down to 7 equivalence classes: // // 1. The field delimiter. // 2. The record terminator. // 3. If the record terminator is CRLF, then CR and LF are // distinct equivalence classes. // 4. The quote byte. // 5. The escape byte. // 6. The comment byte. // 7. Everything else. // // We add those equivalence classes here. If more configuration knobs // are added to the parser with more discriminating bytes, then this // logic will need to be adjusted further. // // Even though this requires an extra bit of indirection when computing // the next transition, microbenchmarks say that it doesn't make much // of a difference. Perhaps because everything fits into the L1 cache. self.dfa.classes.add(self.delimiter); if self.quoting { self.dfa.classes.add(self.quote); if let Some(escape) = self.escape { self.dfa.classes.add(escape); } } if let Some(comment) = self.comment { self.dfa.classes.add(comment); } match self.term { Terminator::Any(b) => self.dfa.classes.add(b), Terminator::CRLF => { self.dfa.classes.add(b'\r'); self.dfa.classes.add(b'\n'); } _ => unreachable!(), } // Build the DFA transition table by computing the DFA state for all // possible combinations of state and input byte. for &state in NFA_STATES { for c in (0..256).map(|c| c as u8) { let mut nfa_result = (state, NfaInputAction::Epsilon); // Consume NFA states until we hit a non-epsilon transition. while nfa_result.0 != NfaState::End && nfa_result.1 == NfaInputAction::Epsilon { nfa_result = self.transition_nfa(nfa_result.0, c); } let from = self.dfa.new_state(state); let to = self.dfa.new_state(nfa_result.0); self.dfa.set( from, c, to, nfa_result.1 == NfaInputAction::CopyToOutput, ); } } self.dfa_state = self.dfa.new_state(NfaState::StartRecord); self.dfa.finish(); } // The NFA implementation follows. The transition_final_nfa and // transition_nfa methods are required for the DFA to operate. The // rest are included for completeness (and debugging). Note that this // NFA implementation is included in most of the CSV parser tests below. #[inline(always)] fn read_record_nfa( &mut self, input: &[u8], output: &mut [u8], ends: &mut [usize], ) -> (ReadRecordResult, usize, usize, usize) { if input.is_empty() { let s = self.transition_final_nfa(self.nfa_state); let res = ReadRecordResult::from_nfa(s, false, false, false); return match res { ReadRecordResult::Record => { if ends.is_empty() { return (ReadRecordResult::OutputEndsFull, 0, 0, 0); } self.nfa_state = s; ends[0] = self.output_pos; self.output_pos = 0; (res, 0, 0, 1) } _ => { self.nfa_state = s; (res, 0, 0, 0) } }; } if output.is_empty() { return (ReadRecordResult::OutputFull, 0, 0, 0); } if ends.is_empty() { return (ReadRecordResult::OutputEndsFull, 0, 0, 0); } let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0); let mut state = self.nfa_state; while nin < input.len() && nout < output.len() && nend < ends.len() { let (s, io) = self.transition_nfa(state, input[nin]); match io { NfaInputAction::CopyToOutput => { output[nout] = input[nin]; nout += 1; nin += 1; } NfaInputAction::Discard => { nin += 1; } NfaInputAction::Epsilon => {} } state = s; if state.is_field_final() { ends[nend] = nout; nend += 1; if state != NfaState::EndFieldDelim { break; } } } let res = ReadRecordResult::from_nfa( state, nin >= input.len(), nout >= output.len(), nend >= ends.len(), ); self.nfa_state = state; self.output_pos = if res.is_record() { 0 } else { nout }; (res, nin, nout, nend) } #[inline(always)] fn read_field_nfa( &mut self, input: &[u8], output: &mut [u8], ) -> (ReadFieldResult, usize, usize) { if input.is_empty() { self.nfa_state = self.transition_final_nfa(self.nfa_state); let res = ReadFieldResult::from_nfa(self.nfa_state, false, false); return (res, 0, 0); } if output.is_empty() { // If the output buffer is empty, then we can never make progress, // so just quit now. return (ReadFieldResult::OutputFull, 0, 0); } let (mut nin, mut nout) = (0, 0); let mut state = self.nfa_state; while nin < input.len() && nout < output.len() { let (s, io) = self.transition_nfa(state, input[nin]); match io { NfaInputAction::CopyToOutput => { output[nout] = input[nin]; nout += 1; nin += 1; } NfaInputAction::Discard => { nin += 1; } NfaInputAction::Epsilon => (), } state = s; if state.is_field_final() { break; } } let res = ReadFieldResult::from_nfa( state, nin >= input.len(), nout >= output.len(), ); self.nfa_state = state; (res, nin, nout) } /// Compute the final NFA transition after all caller-provided input has /// been exhausted. #[inline(always)] fn transition_final_nfa(&self, state: NfaState) -> NfaState { use self::NfaState::*; match state { End | StartRecord | EndRecord | InComment | CRLF => End, StartField | EndFieldDelim | EndFieldTerm | InField | InQuotedField | InEscapedQuote | InDoubleEscapedQuote | InRecordTerm => EndRecord, } } /// Compute the next NFA state given the current NFA state and the current /// input byte. /// /// This returns the next NFA state along with an NfaInputAction that /// indicates what should be done with the input byte (nothing for an epsilon /// transition, copied to a caller provided output buffer, or discarded). #[inline(always)] fn transition_nfa( &self, state: NfaState, c: u8, ) -> (NfaState, NfaInputAction) { use self::NfaState::*; match state { End => (End, NfaInputAction::Epsilon), StartRecord => { if self.term.equals(c) { (StartRecord, NfaInputAction::Discard) } else if self.comment == Some(c) { (InComment, NfaInputAction::Discard) } else { (StartField, NfaInputAction::Epsilon) } } EndRecord => (StartRecord, NfaInputAction::Epsilon), StartField => { if self.quoting && self.quote == c { (InQuotedField, NfaInputAction::Discard) } else if self.delimiter == c { (EndFieldDelim, NfaInputAction::Discard) } else if self.term.equals(c) { (EndFieldTerm, NfaInputAction::Epsilon) } else { (InField, NfaInputAction::CopyToOutput) } } EndFieldDelim => (StartField, NfaInputAction::Epsilon), EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon), InField => { if self.delimiter == c { (EndFieldDelim, NfaInputAction::Discard) } else if self.term.equals(c) { (EndFieldTerm, NfaInputAction::Epsilon) } else { (InField, NfaInputAction::CopyToOutput) } } InQuotedField => { if self.quoting && self.quote == c { (InDoubleEscapedQuote, NfaInputAction::Discard) } else if self.quoting && self.escape == Some(c) { (InEscapedQuote, NfaInputAction::Discard) } else { (InQuotedField, NfaInputAction::CopyToOutput) } } InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput), InDoubleEscapedQuote => { if self.quoting && self.double_quote && self.quote == c { (InQuotedField, NfaInputAction::CopyToOutput) } else if self.delimiter == c { (EndFieldDelim, NfaInputAction::Discard) } else if self.term.equals(c) { (EndFieldTerm, NfaInputAction::Epsilon) } else { (InField, NfaInputAction::CopyToOutput) } } InComment => { if b'\n' == c { (StartRecord, NfaInputAction::Discard) } else { (InComment, NfaInputAction::Discard) } } InRecordTerm => { if self.term.is_crlf() && b'\r' == c { (CRLF, NfaInputAction::Discard) } else { (EndRecord, NfaInputAction::Discard) } } CRLF => { if b'\n' == c { (StartRecord, NfaInputAction::Discard) } else { (StartRecord, NfaInputAction::Epsilon) } } } } } /// The number of slots in the DFA transition table. /// /// This number is computed by multiplying the maximum number of transition /// classes (7) by the total number of NFA states that are used in the DFA /// (10). /// /// The number of transition classes is determined by an equivalence class of /// bytes, where every byte in the same equivalence classes is /// indistinguishable from any other byte with respect to the DFA. For example, /// if neither `a` nor `b` are specifed as a delimiter/quote/terminator/escape, /// then the DFA will never discriminate between `a` or `b`, so they can /// effectively be treated as identical. This reduces storage space /// substantially. /// /// The total number of NFA states (13) is greater than the total number of /// NFA states that are in the DFA. In particular, any NFA state that can only /// be reached by epsilon transitions will never have explicit usage in the /// DFA. const TRANS_CLASSES: usize = 7; const DFA_STATES: usize = 10; const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES; /// The number of possible transition classes. (See the comment on `TRANS_SIZE` /// for more details.) const CLASS_SIZE: usize = 256; /// A representation of a DFA. /// /// For the most part, this is a transition table, but various optimizations /// have been applied to reduce its memory footprint. struct Dfa { /// The core transition table. Each row corresponds to the transitions for /// each input equivalence class. (Input bytes are mapped to their /// corresponding equivalence class with the `classes` map.) /// /// DFA states are represented as an index corresponding to the start of /// its row in this table. trans: [DfaState; TRANS_SIZE], /// A table with the same layout as `trans`, except its values indicate /// whether a particular `(state, equivalence class)` pair should emit an /// output byte. has_output: [bool; TRANS_SIZE], /// A map from input byte to equivalence class. /// /// This is responsible for reducing the effective alphabet size from /// 256 to `TRANS_CLASSES`. classes: DfaClasses, /// The DFA state corresponding to being inside an unquoted field. in_field: DfaState, /// The DFA state corresponding to being inside an quoted field. in_quoted: DfaState, /// The minimum DFA state that indicates a field has been parsed. All DFA /// states greater than this are also final-field states. final_field: DfaState, /// The minimum DFA state that indicates a record has been parsed. All DFA /// states greater than this are also final-record states. final_record: DfaState, } impl Dfa { fn new() -> Dfa { Dfa { trans: [DfaState(0); TRANS_SIZE], has_output: [false; TRANS_SIZE], classes: DfaClasses::new(), in_field: DfaState(0), in_quoted: DfaState(0), final_field: DfaState(0), final_record: DfaState(0), } } fn new_state(&self, nfa_state: NfaState) -> DfaState { let nclasses = self.classes.num_classes() as u8; let idx = (nfa_state as u8).checked_mul(nclasses).unwrap(); DfaState(idx) } fn new_state_final_end(&self) -> DfaState { self.new_state(NfaState::StartRecord) } fn new_state_final_record(&self) -> DfaState { self.new_state(NfaState::EndRecord) } fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) { let cls = self.classes.classes[c as usize]; let idx = state.0 as usize + cls as usize; (self.trans[idx], self.has_output[idx]) } fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) { let cls = self.classes.classes[c as usize]; let idx = from.0 as usize + cls as usize; self.trans[idx] = to; self.has_output[idx] = output; } fn finish(&mut self) { self.in_field = self.new_state(NfaState::InField); self.in_quoted = self.new_state(NfaState::InQuotedField); self.final_field = self.new_state(NfaState::EndFieldDelim); self.final_record = self.new_state(NfaState::EndRecord); } fn new_read_field_result( &self, state: DfaState, is_final_trans: bool, inpdone: bool, outdone: bool, ) -> ReadFieldResult { if state >= self.final_record { ReadFieldResult::Field { record_end: true } } else if state == self.final_field { ReadFieldResult::Field { record_end: false } } else if is_final_trans && state.is_start() { ReadFieldResult::End } else { debug_assert!(state < self.final_field); if !inpdone && outdone { ReadFieldResult::OutputFull } else { ReadFieldResult::InputEmpty } } } fn new_read_record_result( &self, state: DfaState, is_final_trans: bool, inpdone: bool, outdone: bool, endsdone: bool, ) -> ReadRecordResult { if state >= self.final_record { ReadRecordResult::Record } else if is_final_trans && state.is_start() { ReadRecordResult::End } else { debug_assert!(state < self.final_record); if !inpdone && outdone { ReadRecordResult::OutputFull } else if !inpdone && endsdone { ReadRecordResult::OutputEndsFull } else { ReadRecordResult::InputEmpty } } } } /// A map from input byte to equivalence class. struct DfaClasses { classes: [u8; CLASS_SIZE], next_class: usize, } impl DfaClasses { fn new() -> DfaClasses { DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 } } fn add(&mut self, b: u8) { if self.next_class > CLASS_SIZE { panic!("added too many classes") } self.classes[b as usize] = self.next_class as u8; self.next_class = self.next_class + 1; } fn num_classes(&self) -> usize { self.next_class as usize } /// Scan and copy the input bytes to the output buffer quickly. /// /// This assumes that the current state of the DFA is either `InField` or /// `InQuotedField`. In this case, all bytes corresponding to the first /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are /// guaranteed to never result in a state transition out of the current /// state. This function takes advantage of that copies every byte from /// `input` in the first equivalence class to `output`. Once a byte is seen /// outside the first equivalence class, we quit and should fall back to /// the main DFA loop. #[inline(always)] fn scan_and_copy( &self, input: &[u8], nin: &mut usize, output: &mut [u8], nout: &mut usize, ) { while *nin < input.len() && *nout < output.len() && self.classes[input[*nin] as usize] == 0 { output[*nout] = input[*nin]; *nin += 1; *nout += 1; } } } /// A single DFA state. /// /// A DFA state is represented by the starting index of its corresponding row /// in the DFA transition table. This representation allows us to elide a /// single multiplication instruction when computing the next transition for /// a particular input byte. #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] struct DfaState(u8); impl DfaState { fn start() -> DfaState { DfaState(0) } fn is_start(&self) -> bool { self.0 == 0 } } impl fmt::Debug for Dfa { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "Dfa(N/A)") } } impl fmt::Debug for DfaClasses { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!( f, "DfaClasses {{ classes: N/A, next_class: {:?} }}", self.next_class ) } } impl Clone for Dfa { fn clone(&self) -> Dfa { let mut dfa = Dfa::new(); dfa.trans.copy_from_slice(&self.trans); dfa } } impl Clone for DfaClasses { fn clone(&self) -> DfaClasses { let mut x = DfaClasses::new(); x.classes.copy_from_slice(&self.classes); x } } #[cfg(test)] mod tests { use core::str; use arrayvec::{ArrayString, ArrayVec}; use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator}; type Csv = ArrayVec<[Row; 10]>; type Row = ArrayVec<[Field; 10]>; type Field = ArrayString<[u8; 10]>; // OMG I HATE BYTE STRING LITERALS SO MUCH. fn b(s: &str) -> &[u8] { s.as_bytes() } macro_rules! csv { ($([$($field:expr),*]),*) => {{ #[allow(unused_mut)] fn x() -> Csv { let mut csv = Csv::new(); $( let mut row = Row::new(); $( row.push(Field::from($field).unwrap()); )* csv.push(row); )* csv } x() }} } macro_rules! parses_to { ($name:ident, $data:expr, $expected:expr) => { parses_to!($name, $data, $expected, |builder| builder); }; ($name:ident, $data:expr, $expected:expr, $config:expr) => { #[test] fn $name() { let mut builder = ReaderBuilder::new(); builder.nfa(true); $config(&mut builder); let mut rdr = builder.build(); let got = parse_by_field(&mut rdr, $data); let expected = $expected; assert_eq!(expected, got, "nfa by field"); let mut builder = ReaderBuilder::new(); builder.nfa(true); $config(&mut builder); let mut rdr = builder.build(); let got = parse_by_record(&mut rdr, $data); let expected = $expected; assert_eq!(expected, got, "nfa by record"); let mut builder = ReaderBuilder::new(); $config(&mut builder); let mut rdr = builder.build(); let got = parse_by_field(&mut rdr, $data); let expected = $expected; assert_eq!(expected, got, "dfa by field"); let mut builder = ReaderBuilder::new(); $config(&mut builder); let mut rdr = builder.build(); let got = parse_by_record(&mut rdr, $data); let expected = $expected; assert_eq!(expected, got, "dfa by record"); } }; } fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv { let mut data = data.as_bytes(); let mut field = [0u8; 10]; let mut csv = Csv::new(); let mut row = Row::new(); let mut outpos = 0; loop { let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]); data = &data[nin..]; outpos += nout; match res { ReadFieldResult::InputEmpty => { if !data.is_empty() { panic!("missing input data") } } ReadFieldResult::OutputFull => panic!("field too large"), ReadFieldResult::Field { record_end } => { let s = str::from_utf8(&field[..outpos]).unwrap(); row.push(Field::from(s).unwrap()); outpos = 0; if record_end { csv.push(row); row = Row::new(); } } ReadFieldResult::End => { return csv; } } } } fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv { use crate::ReadRecordResult::*; let mut data = data.as_bytes(); let mut record = [0; 1024]; let mut ends = [0; 10]; let mut csv = Csv::new(); let (mut outpos, mut endpos) = (0, 0); loop { let (res, nin, nout, nend) = rdr.read_record( data, &mut record[outpos..], &mut ends[endpos..], ); data = &data[nin..]; outpos += nout; endpos += nend; match res { InputEmpty => { if !data.is_empty() { panic!("missing input data") } } OutputFull => panic!("record too large (out buffer)"), OutputEndsFull => panic!("record too large (end buffer)"), Record => { let s = str::from_utf8(&record[..outpos]).unwrap(); let mut start = 0; let mut row = Row::new(); for &end in &ends[..endpos] { row.push(Field::from(&s[start..end]).unwrap()); start = end; } csv.push(row); outpos = 0; endpos = 0; } End => return csv, } } } parses_to!(one_row_one_field, "a", csv![["a"]]); parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]); parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]); parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]); parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]); parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]); parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]); parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]); parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]); parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]); parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]); parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]); parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields, "a,b,c\nx,y,z", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma, "a,b,\nx,y,", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields_lf, "a,b,c\nx,y,z\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma_lf, "a,b,\nx,y,\n", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields_crlf, "a,b,c\r\nx,y,z\r\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma_crlf, "a,b,\r\nx,y,\r\n", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields_cr, "a,b,c\rx,y,z\r", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma_cr, "a,b,\rx,y,\r", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!( trailing_lines_no_record, "\n\n\na,b,c\nx,y,z\n\n\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( trailing_lines_no_record_cr, "\r\r\ra,b,c\rx,y,z\r\r\r", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( trailing_lines_no_record_crlf, "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!(empty, "", csv![]); parses_to!(empty_lines, "\n\n\n\n", csv![]); parses_to!( empty_lines_interspersed, "\n\na,b\n\n\nx,y\n\n\nm,n\n", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]); parses_to!( empty_lines_interspersed_crlf, "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]); parses_to!( empty_lines_interspersed_mixed, "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!(empty_lines_cr, "\r\r\r\r", csv![]); parses_to!( empty_lines_interspersed_cr, "\r\ra,b\r\r\rx,y\r\r\rm,n\r", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!( term_weird, "zza,bzc,dzz", csv![["a", "b"], ["c", "d"]], |b: &mut ReaderBuilder| { b.terminator(Terminator::Any(b'z')); } ); parses_to!( ascii_delimited, "a\x1fb\x1ec\x1fd", csv![["a", "b"], ["c", "d"]], |b: &mut ReaderBuilder| { b.ascii(); } ); parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]); parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]); parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]); parses_to!(quote_empty, "\"\"", csv![[""]]); parses_to!(quote_lf, "\"\"\n", csv![[""]]); parses_to!(quote_space, "\" \"", csv![[" "]]); parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]); parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]); parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| { b.quote(b'z'); }); // This one is pretty hokey. // I don't really know what the "right" behavior is. parses_to!( quote_delimiter, ",a,,b", csv![["a,b"]], |b: &mut ReaderBuilder| { b.quote(b','); } ); parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]); parses_to!( quote_escapes_no_double, r#""a""b""#, csv![[r#"a"b""#]], |b: &mut ReaderBuilder| { b.double_quote(false); } ); parses_to!( quote_escapes, r#""a\"b""#, csv![[r#"a"b"#]], |b: &mut ReaderBuilder| { b.escape(Some(b'\\')); } ); parses_to!( quote_escapes_change, r#""az"b""#, csv![[r#"a"b"#]], |b: &mut ReaderBuilder| { b.escape(Some(b'z')); } ); parses_to!( quote_escapes_with_comma, r#""\"A,B\"""#, csv![[r#""A,B""#]], |b: &mut ReaderBuilder| { b.escape(Some(b'\\')).double_quote(false); } ); parses_to!( quoting_disabled, r#""abc,foo""#, csv![[r#""abc"#, r#"foo""#]], |b: &mut ReaderBuilder| { b.quoting(false); } ); parses_to!( delimiter_tabs, "a\tb", csv![["a", "b"]], |b: &mut ReaderBuilder| { b.delimiter(b'\t'); } ); parses_to!( delimiter_weird, "azb", csv![["a", "b"]], |b: &mut ReaderBuilder| { b.delimiter(b'z'); } ); parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]); parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]); parses_to!( comment_1, "foo\n# hi\nbar\n", csv![["foo"], ["bar"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); parses_to!( comment_2, "foo\n # hi\nbar\n", csv![["foo"], [" # hi"], ["bar"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); parses_to!( comment_3, "foo\n# hi\nbar\n", csv![["foo"], ["# hi"], ["bar"]], |b: &mut ReaderBuilder| { b.comment(Some(b'\n')); } ); parses_to!( comment_4, "foo,b#ar,baz", csv![["foo", "b#ar", "baz"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); parses_to!( comment_5, "foo,#bar,baz", csv![["foo", "#bar", "baz"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); macro_rules! assert_read { ( $rdr:expr, $input:expr, $output:expr, $expect_in:expr, $expect_out:expr, $expect_res:expr ) => {{ let (res, nin, nout) = $rdr.read_field($input, $output); assert_eq!($expect_in, nin); assert_eq!($expect_out, nout); assert_eq!($expect_res, res); }}; } // This tests that feeding a new reader with an empty buffer sends us // straight to End. #[test] fn stream_empty() { use crate::ReadFieldResult::*; let mut rdr = Reader::new(); assert_read!(rdr, &[], &mut [], 0, 0, End); } // Test that a single space is treated as a single field. #[test] fn stream_space() { use crate::ReadFieldResult::*; let mut rdr = Reader::new(); assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty); assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true }); assert_read!(rdr, &[], &mut [0], 0, 0, End); } // Test that a single comma ... #[test] fn stream_comma() { use crate::ReadFieldResult::*; let mut rdr = Reader::new(); assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false }); assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true }); assert_read!(rdr, &[], &mut [0], 0, 0, End); } // Test that we can read a single large field in multiple output // buffers. #[test] fn stream_output_chunks() { use crate::ReadFieldResult::*; let mut inp = b("fooquux"); let out = &mut [0; 2]; let mut rdr = Reader::new(); assert_read!(rdr, inp, out, 2, 2, OutputFull); assert_eq!(out, b("fo")); inp = &inp[2..]; assert_read!(rdr, inp, out, 2, 2, OutputFull); assert_eq!(out, b("oq")); inp = &inp[2..]; assert_read!(rdr, inp, out, 2, 2, OutputFull); assert_eq!(out, b("uu")); inp = &inp[2..]; assert_read!(rdr, inp, out, 1, 1, InputEmpty); assert_eq!(&out[..1], b("x")); inp = &inp[1..]; assert!(inp.is_empty()); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, inp, out, 0, 0, End); } // Test that we can read a single large field across multiple input // buffers. #[test] fn stream_input_chunks() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty); assert_eq!(&out[..2], b("fo")); assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty); assert_eq!(&out[..4], b("fooq")); assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty); assert_eq!(&out[..6], b("fooquu")); assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty); assert_eq!(&out[..7], b("fooquux")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test we can read doubled quotes correctly in a stream. #[test] fn stream_doubled_quotes() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty); assert_eq!(&out[..2], b("fo")); assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty); assert_eq!(&out[..4], b("fo\"o")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test we can read escaped quotes correctly in a stream. #[test] fn stream_escaped_quotes() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut builder = ReaderBuilder::new(); let mut rdr = builder.escape(Some(b'\\')).build(); assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty); assert_eq!(&out[..2], b("fo")); assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty); assert_eq!(&out[..4], b("fo\"o")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test that empty output buffers don't wreak havoc. #[test] fn stream_empty_output() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!( rdr, b("foo,bar"), out, 4, 3, Field { record_end: false } ); assert_eq!(&out[..3], b("foo")); assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull); assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty); assert_eq!(&out[..3], b("bar")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test that we can reset the parser mid-stream and count on it to do // the right thing. #[test] fn reset_works() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty); assert_eq!(&out[..3], b("foo")); // Without reseting the parser state, the reader will remember that // we're in a quoted field, and therefore interpret the leading double // quotes below as a single quote and the trailing quote as a matching // terminator. With the reset, however, the parser forgets the quoted // field and treats the leading double quotes as a syntax quirk and // drops them, in addition to hanging on to the trailing unmatched // quote. (Matches Python's behavior.) rdr.reset(); assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty); assert_eq!(&out[..4], b("bar\"")); } // Test the line number reporting is correct. #[test] fn line_numbers() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_eq!(1, rdr.line()); assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty); assert_eq!(5, rdr.line()); assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false }); assert_eq!(5, rdr.line()); assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true }); assert_eq!(6, rdr.line()); assert_read!(rdr, &[], &mut [0], 0, 0, End); assert_eq!(6, rdr.line()); } macro_rules! assert_read_record { ( $rdr:expr, $input:expr, $output:expr, $ends:expr, $expect_in:expr, $expect_out:expr, $expect_end:expr, $expect_res:expr ) => {{ let (res, nin, nout, nend) = $rdr.read_record($input, $output, $ends); assert_eq!($expect_res, res, "result"); assert_eq!($expect_in, nin, "input"); assert_eq!($expect_out, nout, "output"); assert_eq!($expect_end, nend, "ends"); }}; } // Test that we can incrementally read a record. #[test] fn stream_record() { use crate::ReadRecordResult::*; let mut inp = b("foo,bar\nbaz"); let out = &mut [0; 1024]; let ends = &mut [0; 10]; let mut rdr = Reader::new(); assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); assert_eq!(ends[0], 3); assert_eq!(ends[1], 6); inp = &inp[8..]; assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty); inp = &inp[3..]; assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record); assert_eq!(ends[0], 3); assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End); } // Test that if our output ends are full during the last read that // we get an appropriate state returned. #[test] fn stream_record_last_end_output_full() { use crate::ReadRecordResult::*; let mut inp = b("foo,bar\nbaz"); let out = &mut [0; 1024]; let ends = &mut [0; 10]; let mut rdr = Reader::new(); assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); assert_eq!(ends[0], 3); assert_eq!(ends[1], 6); inp = &inp[8..]; assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty); inp = &inp[3..]; assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull); assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record); assert_eq!(ends[0], 3); assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End); } #[test] fn reset_input_partial() { use crate::ReadRecordResult::*; let inp = b("foo,bar\nbaz"); let out = &mut [0; 1024]; let ends = &mut [0; 10]; let mut rdr = Reader::new(); assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); // Try to read incomplete record. let (result, _, _, _) = rdr.read_record(&inp[8..], out, ends); assert_eq!(result, InputEmpty); rdr.reset(); let inp = b("baz,raz\n"); let (result, _, _, _) = rdr.read_record(inp, out, ends); assert_eq!(result, Record); assert_eq!(ends[0], 3); } }