use core::fmt;

use crate::Terminator;

// BE ADVISED
//
// This may just be one of the more complicated CSV parsers you'll come across.
// The implementation never allocates and consists of both a functional NFA
// parser and a DFA parser. The DFA parser is the work horse and we could elide
// much of the work involved in making the NFA parser work, but the NFA parser
// is much easier to debug. The NFA parser is tested alongside the DFA parser,
// so they should never be out of sync.
//
// The basic structure of the implementation is to encode the NFA parser as
// an explicit state machine in code. The DFA is then generated by populating
// a transition table on the stack by exhaustively enumerating all possible
// states on all possible inputs (this is possible because the number of states
// and the number of inputs is very small).
//
// Note that some pieces of the NFA parser (such as the NFA state machine) are
// required. In particular, the translation from the NFA to the DFA depends on
// the configuration of the CSV parser as given by the caller, and indeed, this
// is one of the key performance benefits of the DFA: it doesn't have any
// overhead (other than a bigger transition table) associated with the number
// of configuration options.
//
// ADVICE FOR HACKERS
//
// This code is too clever for its own good. As such, changes to some parts of
// the code may have a non-obvious impact on other parts. This is mostly
// motivated by trying to keep the DFA transition table as small as possible,
// since it is stored on the stack. Here are some tips that may save you some
// time:
//
// * If you add a new NFA state, then you also need to consider how it impacts
//   the DFA. If all of the incoming transitions into an NFA state are
//   epsilon transitions, then it probably isn't materialized in the DFA.
//   If the NFA state indicates that a field or a record has been parsed, then
//   it should be considered final. Let the comments in `NfaState` be your
//   guide.
// * If you add a new configuration knob to the parser, then you may need to
//   modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant
//   indicates the total number of discriminating bytes in the DFA. And if you
//   modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to
//   add a new class. For example, in order to add parsing support for
//   comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment
//   byte (if one exists) to the list of classes in `build_dfa`.
// * The special DFA start state doubles as the final state once all input
//   from the caller has been exhausted. We must be careful to guard this
//   case analysis on whether the input is actually exhausted, since the start
//   state is an otherwise valid state.

/// A pull based CSV reader.
///
/// This reader parses CSV data using a finite state machine. Callers can
/// extract parsed data incrementally using one of the `read` methods.
///
/// Note that this CSV reader is somewhat encoding agnostic. The source data
/// needs to be at least ASCII compatible. There is no support for specifying
/// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
/// any byte can be used, although callers probably want to stick to the ASCII
/// subset (`<= 0x7F`).
///
/// # Usage
///
/// A reader has two different ways to read CSV data, each with their own
/// trade offs.
///
/// * `read_field` - Copies a single CSV field into an output buffer while
///   unescaping quotes. This is simple to use and doesn't require storing an
///   entire record contiguously in memory, but it is slower.
/// * `read_record` - Copies an entire CSV record into an output buffer while
///   unescaping quotes. The ending positions of each field are copied into
///   an additional buffer. This is harder to use and requires larger output
///   buffers, but it is faster than `read_field` since it amortizes more
///   costs.
///
/// # RFC 4180
///
/// [RFC 4180](https://tools.ietf.org/html/rfc4180)
/// is the closest thing to a specification for CSV data. Unfortunately,
/// CSV data that is seen in the wild can vary significantly. Often, the CSV
/// data is outright invalid. Instead of fixing the producers of bad CSV data,
/// we have seen fit to make consumers much more flexible in what they accept.
/// This reader continues that tradition, and therefore, isn't technically
/// compliant with RFC 4180. In particular, this reader will never return an
/// error and will always find *a* parse.
///
/// Here are some detailed differences from RFC 4180:
///
/// * CRLF, LF and CR are each treated as a single record terminator by
///   default.
/// * Records are permitted to be of varying length.
/// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// Whether to recognize escaped quotes.
    escape: Option<u8>,
    /// Whether to recognize doubled quotes.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number.
    line: u64,
    /// Whether this parser has ever read anything.
    has_read: bool,
    /// The current position in the output buffer when reading a record.
    output_pos: usize,
}

impl Default for Reader {
    fn default() -> Reader {
        Reader {
            dfa: Dfa::new(),
            dfa_state: DfaState::start(),
            nfa_state: NfaState::StartRecord,
            delimiter: b',',
            term: Terminator::default(),
            quote: b'"',
            escape: None,
            double_quote: true,
            comment: None,
            quoting: true,
            use_nfa: false,
            line: 1,
            has_read: false,
            output_pos: 0,
        }
    }
}

/// Builds a CSV reader with various configuration knobs.
///
/// This builder can be used to tweak the field delimiter, record terminator
/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
/// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    rdr: Reader,
}

impl ReaderBuilder {
    /// Create a new builder.
    pub fn new() -> ReaderBuilder {
        ReaderBuilder::default()
    }

    /// Build a CSV parser from this configuration.
    pub fn build(&self) -> Reader {
        let mut rdr = self.rdr.clone();
        rdr.build_dfa();
        rdr
    }

    /// The field delimiter to use when parsing CSV.
    ///
    /// The default is `b','`.
    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
        self.rdr.delimiter = delimiter;
        self
    }

    /// The record terminator to use when parsing CSV.
    ///
    /// A record terminator can be any single byte. The default is a special
    /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
    /// or `\r\n` as a single record terminator.
    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
        self.rdr.term = term;
        self
    }

    /// The quote character to use when parsing CSV.
    ///
    /// The default is `b'"'`.
    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
        self.rdr.quote = quote;
        self
    }

    /// The escape character to use when parsing CSV.
    ///
    /// In some variants of CSV, quotes are escaped using a special escape
    /// character like `\` (instead of escaping quotes by doubling them).
    ///
    /// By default, recognizing these idiosyncratic escapes is disabled.
    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.escape = escape;
        self
    }

    /// Enable double quote escapes.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// doubled quotes are not interpreted as escapes.
    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.double_quote = yes;
        self
    }

    /// Enable or disable quoting.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// quotes are not treated specially.
    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.quoting = yes;
        self
    }

    /// The comment character to use when parsing CSV.
    ///
    /// If the start of a record begins with the byte given here, then that
    /// line is ignored by the CSV parser.
    ///
    /// This is disabled by default.
    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.comment = comment;
        self
    }

    /// A convenience method for specifying a configuration to read ASCII
    /// delimited text.
    ///
    /// This sets the delimiter and record terminator to the ASCII unit
    /// separator (`\x1F`) and record separator (`\x1E`), respectively.
    pub fn ascii(&mut self) -> &mut ReaderBuilder {
        self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
    }

    /// Enable or disable the NFA for parsing CSV.
    ///
    /// This is intended to be a debug option. The NFA is always slower than
    /// the DFA.
    #[doc(hidden)]
    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.use_nfa = yes;
        self
    }
}

/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

impl ReadFieldResult {
    /// Translate a final NFA state (plus buffer-exhaustion flags) into the
    /// result reported to the caller of `read_field`.
    fn from_nfa(
        state: NfaState,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        match state {
            NfaState::End => ReadFieldResult::End,
            NfaState::EndRecord | NfaState::CRLF => {
                ReadFieldResult::Field { record_end: true }
            }
            NfaState::EndFieldDelim => {
                ReadFieldResult::Field { record_end: false }
            }
            _ => {
                // Any other state must be non-final; otherwise it should have
                // been handled by one of the arms above.
                assert!(!state.is_field_final());
                if !inpdone && outdone {
                    ReadFieldResult::OutputFull
                } else {
                    ReadFieldResult::InputEmpty
                }
            }
        }
    }
}

/// The result of parsing at most one field from CSV data while ignoring the
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

impl ReadRecordResult {
    /// Returns true if this result indicates that a full record was parsed.
    fn is_record(&self) -> bool {
        *self == ReadRecordResult::Record
    }

    /// Translate a final NFA state (plus buffer-exhaustion flags) into the
    /// result reported to the caller of `read_record`.
    fn from_nfa(
        state: NfaState,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        match state {
            NfaState::End => ReadRecordResult::End,
            NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
            _ => {
                // Any other state must be non-final; otherwise it should have
                // been handled by one of the arms above.
                assert!(!state.is_record_final());
                if !inpdone && outdone {
                    ReadRecordResult::OutputFull
                } else if !inpdone && endsdone {
                    ReadRecordResult::OutputEndsFull
                } else {
                    ReadRecordResult::InputEmpty
                }
            }
        }
    }
}

/// The result of parsing at most one record from CSV data while ignoring
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}

/// What should be done with input bytes during an NFA transition
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    // Do not consume an input byte
    Epsilon,
    // Copy input byte to a caller-provided output buffer
    CopyToOutput,
    // Consume but do not copy input byte (for example, seeing a field
    // delimiter will consume an input byte but should not copy it to the
    // output buffer).
    Discard,
}

/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as DFA
/// states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states.
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}

/// A list of NFA states that have an explicit representation in the DFA.
const NFA_STATES: &'static [NfaState] = &[
    NfaState::StartRecord,
    NfaState::StartField,
    NfaState::EndFieldDelim,
    NfaState::InField,
    NfaState::InQuotedField,
    NfaState::InEscapedQuote,
    NfaState::InDoubleEscapedQuote,
    NfaState::InComment,
    NfaState::EndRecord,
    NfaState::CRLF,
];

impl NfaState {
    /// Returns true if this state indicates that a field has been parsed.
    fn is_field_final(&self) -> bool {
        match *self {
            NfaState::End
            | NfaState::EndRecord
            | NfaState::CRLF
            | NfaState::EndFieldDelim => true,
            _ => false,
        }
    }

    /// Returns true if this state indicates that a record has been parsed.
    fn is_record_final(&self) -> bool {
        match *self {
            NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
            _ => false,
        }
    }
}

impl Reader {
    /// Create a new CSV reader with a default parser configuration.
    pub fn new() -> Reader {
        ReaderBuilder::new().build()
    }

    /// Reset the parser such that it behaves as if it had never been used.
    ///
    /// This may be useful when reading CSV data in a random access pattern.
    pub fn reset(&mut self) {
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.nfa_state = NfaState::StartRecord;
        self.line = 1;
        self.has_read = false;
        self.output_pos = 0;
    }

    /// Return the current line number as measured by the number of occurrences
    /// of `\n`.
    ///
    /// Line numbers start at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }

    /// Parse a single CSV field in `input` and copy field data to `output`.
    ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and a caller provided buffer, `output`, in which to store field
    /// data extracted from `input`. The field data copied to `output` will
    /// have its quotes unescaped.
    ///
    /// Calling this routine parses at most a single field and returns
    /// three values indicating the state of the parser. The first value, a
    /// `ReadFieldResult`, tells the caller what to do next. For example, if
    /// the entire input was read or if the output buffer was filled before
    /// a full field had been read, then `ReadFieldResult::InputEmpty` or
    /// `ReadFieldResult::OutputFull` is returned, respectively. See the
    /// documentation for `ReadFieldResult` for more details.
    ///
    /// The other two values returned correspond to the number of bytes
    /// read from `input` and written to `output`, respectively.
    ///
    /// # Termination
    ///
    /// This reader interprets an empty `input` buffer as an indication that
    /// there is no CSV data left to read. Namely, when the caller has
    /// exhausted all CSV data, the caller should continue to call `read` with
    /// an empty input buffer until `ReadFieldResult::End` is returned.
    ///
    /// # Errors
    ///
    /// This CSV reader can never return an error. Instead, it prefers *a*
    /// parse over *no* parse.
    pub fn read_field(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        let (input, bom_nin) = self.strip_utf8_bom(input);
        let (res, nin, nout) = if self.use_nfa {
            self.read_field_nfa(input, output)
        } else {
            self.read_field_dfa(input, output)
        };
        self.has_read = true;
        (res, nin + bom_nin, nout)
    }

    /// Parse a single CSV record in `input` and copy each field contiguously
    /// to `output`, with the end position of each field written to `ends`.
    ///
    /// **NOTE**: This method is more cumbersome to use than `read_field`, but
    /// it can be faster since it amortizes more work.
    ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and two caller provided buffers to store the unescaped field
    /// data (`output`) and the end position of each field in the record
    /// (`fields`).
    ///
    /// Calling this routine parses at most a single record and returns four
    /// values indicating the state of the parser. The first value, a
    /// `ReadRecordResult`, tells the caller what to do next. For example, if
    /// the entire input was read or if the output buffer was filled before a
    /// full field had been read, then `ReadRecordResult::InputEmpty` or
    /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if
    /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is
    /// returned. See the documentation for `ReadRecordResult` for more
    /// details.
    ///
    /// The other three values correspond to the number of bytes read from
    /// `input`, the number of bytes written to `output` and the number of
    /// end positions written to `ends`, respectively.
    ///
    /// The end positions written to `ends` are constructed as if there was
    /// a single contiguous buffer in memory containing the entire row, even
    /// if `ReadRecordResult::OutputFull` was returned in the middle of reading
    /// a row.
    ///
    /// # Termination
    ///
    /// This reader interprets an empty `input` buffer as an indication that
    /// there is no CSV data left to read. Namely, when the caller has
    /// exhausted all CSV data, the caller should continue to call `read` with
    /// an empty input buffer until `ReadRecordResult::End` is returned.
    ///
    /// # Errors
    ///
    /// This CSV reader can never return an error. Instead, it prefers *a*
    /// parse over *no* parse.
    pub fn read_record(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        let (input, bom_nin) = self.strip_utf8_bom(input);
        let (res, nin, nout, nend) = if self.use_nfa {
            self.read_record_nfa(input, output, ends)
        } else {
            self.read_record_dfa(input, output, ends)
        };
        self.has_read = true;
        (res, nin + bom_nin, nout, nend)
    }

    /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that
    /// this method will fail to strip off the BOM if only part of the BOM is
    /// buffered. Hopefully that won't happen very often.
    fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
        // NOTE(review): the condition is written as a block expression,
        // presumably to satisfy the formatter — behavior is identical to a
        // plain boolean expression.
        let (input, nin) = if {
            !self.has_read
                && input.len() >= 3
                && &input[0..3] == b"\xef\xbb\xbf"
        } {
            (&input[3..], 3)
        } else {
            (input, 0)
        };
        (input, nin)
    }

    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and while
            // they'll have everything they need in `output`, they'll be
            // missing the final end position of the final field in `ends`.
            // We insert that here, but we must take care to handle the case
            // where `ends` doesn't have enough space. If it doesn't have
            // enough space, then we also can't transition to the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                // Field ends are reported as if the entire record lived in
                // one contiguous buffer, hence the `output_pos` offset.
                ends[nend] = self.output_pos + nout;
                nend += 1;
                if state > self.dfa.final_field {
                    break;
                }
            }
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                // Fast path: bulk-copy bytes that cannot trigger a state
                // change (i.e., bytes outside every equivalence class).
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        if res.is_record() {
            self.output_pos = 0;
        } else {
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }

    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        if output.is_empty() {
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }

    /// Perform the final state transition, i.e., when the caller indicates
    /// that the input has been exhausted.
    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
        // If we've already emitted a record or think we're ready to start
        // parsing a new record, then we should sink into the final state
        // and never move from there. (pro-tip: the start state doubles as
        // the final state!)
        if state >= self.dfa.final_record || state.is_start() {
            self.dfa.new_state_final_end()
        } else {
            self.dfa.new_state_final_record()
        }
    }

    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication instruction
        // when computing the next transition, we store the starting index of
        // each state's row, which would not be representable in a single
        // byte. So we'd need a `u16`, which doubles our transition table size
        // to ~5KB. This is a lot to put on the stack, even though it probably
        // fits in the L1 cache of most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are otherwise
        // indistinguishable to the DFA, so it would be OK to treat them as
        // if they were equivalent. That is, they are in the same equivalence
        // class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration knobs
        // are added to the parser with more discriminating bytes, then this
        // logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when computing
        // the next transition, microbenchmarks say that it doesn't make much
        // of a difference. Perhaps because everything fits into the L1 cache.
        self.dfa.classes.add(self.delimiter);
        if self.quoting {
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for all
        // possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.dfa.finish();
    }

    // The NFA implementation follows. The `transition_final_nfa` and
    // `transition_nfa` methods are required for the DFA to operate. The
    // rest are included for completeness (and debugging). Note that this
    // NFA implementation is included in most of the CSV parser tests below.
    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            return match res {
                ReadRecordResult::Record => {
                    // As in the DFA, the final field's end position must be
                    // reported here, and we can only advance the state if
                    // `ends` has room for it.
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        // NOTE(review): `nout` starts at `self.output_pos` rather than `0`,
        // so end positions (and the returned byte count) are measured as if
        // the record occupied one contiguous buffer across resumed calls —
        // matching the `ends` contract documented on `read_record`.
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                ends[nend] = nout;
                nend += 1;
                if state != NfaState::EndFieldDelim {
                    // A final state that isn't a field delimiter ends the
                    // record (or the input), so stop here.
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }

    #[inline(always)]
    fn read_field_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            self.nfa_state = self.transition_final_nfa(self.nfa_state);
            let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
            return (res, 0, 0);
        }
        if output.is_empty() {
            // If the output buffer is empty, then we can never make progress,
            // so just quit now.
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                NfaInputAction::Epsilon => (),
            }
            state = s;
            if state.is_field_final() {
                break;
            }
        }
        let res = ReadFieldResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.nfa_state = state;
        (res, nin, nout)
    }

    /// Compute the final NFA transition after all caller-provided input has
    /// been exhausted.
    #[inline(always)]
    fn transition_final_nfa(&self, state: NfaState) -> NfaState {
        use self::NfaState::*;
        match state {
            End | StartRecord | EndRecord | InComment | CRLF => End,
            StartField | EndFieldDelim | EndFieldTerm | InField
            | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
            | InRecordTerm => EndRecord,
        }
    }

    /// Compute the next NFA state given the current NFA state and the current
    /// input byte.
    ///
    /// This returns the next NFA state along with an NfaInputAction that
    /// indicates what should be done with the input byte (nothing for an
    /// epsilon transition, copied to a caller provided output buffer, or
    /// discarded).
    #[inline(always)]
    fn transition_nfa(
        &self,
        state: NfaState,
        c: u8,
    ) -> (NfaState, NfaInputAction) {
        use self::NfaState::*;
        match state {
            // Terminal state: absorb everything without consuming input.
            End => (End, NfaInputAction::Epsilon),
            StartRecord => {
                if self.term.equals(c) {
                    // Leading terminators (empty lines) are skipped.
                    (StartRecord, NfaInputAction::Discard)
                } else if self.comment == Some(c) {
                    (InComment, NfaInputAction::Discard)
                } else {
                    (StartField, NfaInputAction::Epsilon)
                }
            }
            EndRecord => (StartRecord, NfaInputAction::Epsilon),
            StartField => {
                // NOTE: the quote test comes before the delimiter test, so
                // when quote == delimiter the byte is treated as a quote
                // (exercised by the `quote_delimiter` test below).
                if self.quoting && self.quote == c {
                    (InQuotedField, NfaInputAction::Discard)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
            InField => {
                if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InQuotedField => {
                if self.quoting && self.quote == c {
                    // Might be a closing quote or the first half of an
                    // escaped (doubled) quote; disambiguated next byte.
                    (InDoubleEscapedQuote, NfaInputAction::Discard)
                } else if self.quoting && self.escape == Some(c) {
                    (InEscapedQuote, NfaInputAction::Discard)
                } else {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                }
            }
            // The byte after an escape is always taken literally.
            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
            InDoubleEscapedQuote => {
                if self.quoting && self.double_quote && self.quote == c {
                    // Doubled quote: emit a single quote and stay quoted.
                    (InQuotedField, NfaInputAction::CopyToOutput)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    // Quote closed but more bytes follow: continue unquoted.
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InComment => {
                // Comments always run to the next line feed.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (InComment, NfaInputAction::Discard)
                }
            }
            InRecordTerm => {
                if self.term.is_crlf() && b'\r' == c {
                    (CRLF, NfaInputAction::Discard)
                } else {
                    (EndRecord, NfaInputAction::Discard)
                }
            }
            CRLF => {
                // A `\n` after `\r` belongs to the same terminator; anything
                // else starts the next record without being consumed here.
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (StartRecord, NfaInputAction::Epsilon)
                }
            }
        }
    }
}

/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class of
/// bytes, where every byte in the same equivalence classes is
/// indistinguishable from any other byte with respect to the DFA. For example,
/// if neither `a` nor `b` are specifed as a delimiter/quote/terminator/escape,
/// then the DFA will never discriminate between `a` or `b`, so they can
/// effectively be treated as identical. This reduces storage space
/// substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can only
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The size of the per-byte equivalence class map: one entry for every
/// possible input byte value. Each entry holds the (much smaller) transition
/// class that byte belongs to. (See the comment on `TRANS_SIZE` for more
/// details on transition classes.)
const CLASS_SIZE: usize = 256;

/// A representation of a DFA.
1099 /// 1100 /// For the most part, this is a transition table, but various optimizations 1101 /// have been applied to reduce its memory footprint. 1102 struct Dfa { 1103 /// The core transition table. Each row corresponds to the transitions for 1104 /// each input equivalence class. (Input bytes are mapped to their 1105 /// corresponding equivalence class with the `classes` map.) 1106 /// 1107 /// DFA states are represented as an index corresponding to the start of 1108 /// its row in this table. 1109 trans: [DfaState; TRANS_SIZE], 1110 /// A table with the same layout as `trans`, except its values indicate 1111 /// whether a particular `(state, equivalence class)` pair should emit an 1112 /// output byte. 1113 has_output: [bool; TRANS_SIZE], 1114 /// A map from input byte to equivalence class. 1115 /// 1116 /// This is responsible for reducing the effective alphabet size from 1117 /// 256 to `TRANS_CLASSES`. 1118 classes: DfaClasses, 1119 /// The DFA state corresponding to being inside an unquoted field. 1120 in_field: DfaState, 1121 /// The DFA state corresponding to being inside an quoted field. 1122 in_quoted: DfaState, 1123 /// The minimum DFA state that indicates a field has been parsed. All DFA 1124 /// states greater than this are also final-field states. 1125 final_field: DfaState, 1126 /// The minimum DFA state that indicates a record has been parsed. All DFA 1127 /// states greater than this are also final-record states. 
1128 final_record: DfaState, 1129 } 1130 1131 impl Dfa { new() -> Dfa1132 fn new() -> Dfa { 1133 Dfa { 1134 trans: [DfaState(0); TRANS_SIZE], 1135 has_output: [false; TRANS_SIZE], 1136 classes: DfaClasses::new(), 1137 in_field: DfaState(0), 1138 in_quoted: DfaState(0), 1139 final_field: DfaState(0), 1140 final_record: DfaState(0), 1141 } 1142 } 1143 new_state(&self, nfa_state: NfaState) -> DfaState1144 fn new_state(&self, nfa_state: NfaState) -> DfaState { 1145 let nclasses = self.classes.num_classes() as u8; 1146 let idx = (nfa_state as u8).checked_mul(nclasses).unwrap(); 1147 DfaState(idx) 1148 } 1149 new_state_final_end(&self) -> DfaState1150 fn new_state_final_end(&self) -> DfaState { 1151 self.new_state(NfaState::StartRecord) 1152 } 1153 new_state_final_record(&self) -> DfaState1154 fn new_state_final_record(&self) -> DfaState { 1155 self.new_state(NfaState::EndRecord) 1156 } 1157 get_output(&self, state: DfaState, c: u8) -> (DfaState, bool)1158 fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) { 1159 let cls = self.classes.classes[c as usize]; 1160 let idx = state.0 as usize + cls as usize; 1161 (self.trans[idx], self.has_output[idx]) 1162 } 1163 set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool)1164 fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) { 1165 let cls = self.classes.classes[c as usize]; 1166 let idx = from.0 as usize + cls as usize; 1167 self.trans[idx] = to; 1168 self.has_output[idx] = output; 1169 } 1170 finish(&mut self)1171 fn finish(&mut self) { 1172 self.in_field = self.new_state(NfaState::InField); 1173 self.in_quoted = self.new_state(NfaState::InQuotedField); 1174 self.final_field = self.new_state(NfaState::EndFieldDelim); 1175 self.final_record = self.new_state(NfaState::EndRecord); 1176 } 1177 new_read_field_result( &self, state: DfaState, is_final_trans: bool, inpdone: bool, outdone: bool, ) -> ReadFieldResult1178 fn new_read_field_result( 1179 &self, 1180 state: DfaState, 1181 
is_final_trans: bool, 1182 inpdone: bool, 1183 outdone: bool, 1184 ) -> ReadFieldResult { 1185 if state >= self.final_record { 1186 ReadFieldResult::Field { record_end: true } 1187 } else if state == self.final_field { 1188 ReadFieldResult::Field { record_end: false } 1189 } else if is_final_trans && state.is_start() { 1190 ReadFieldResult::End 1191 } else { 1192 debug_assert!(state < self.final_field); 1193 if !inpdone && outdone { 1194 ReadFieldResult::OutputFull 1195 } else { 1196 ReadFieldResult::InputEmpty 1197 } 1198 } 1199 } 1200 new_read_record_result( &self, state: DfaState, is_final_trans: bool, inpdone: bool, outdone: bool, endsdone: bool, ) -> ReadRecordResult1201 fn new_read_record_result( 1202 &self, 1203 state: DfaState, 1204 is_final_trans: bool, 1205 inpdone: bool, 1206 outdone: bool, 1207 endsdone: bool, 1208 ) -> ReadRecordResult { 1209 if state >= self.final_record { 1210 ReadRecordResult::Record 1211 } else if is_final_trans && state.is_start() { 1212 ReadRecordResult::End 1213 } else { 1214 debug_assert!(state < self.final_record); 1215 if !inpdone && outdone { 1216 ReadRecordResult::OutputFull 1217 } else if !inpdone && endsdone { 1218 ReadRecordResult::OutputEndsFull 1219 } else { 1220 ReadRecordResult::InputEmpty 1221 } 1222 } 1223 } 1224 } 1225 1226 /// A map from input byte to equivalence class. 
1227 struct DfaClasses { 1228 classes: [u8; CLASS_SIZE], 1229 next_class: usize, 1230 } 1231 1232 impl DfaClasses { new() -> DfaClasses1233 fn new() -> DfaClasses { 1234 DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 } 1235 } 1236 add(&mut self, b: u8)1237 fn add(&mut self, b: u8) { 1238 if self.next_class > CLASS_SIZE { 1239 panic!("added too many classes") 1240 } 1241 self.classes[b as usize] = self.next_class as u8; 1242 self.next_class = self.next_class + 1; 1243 } 1244 num_classes(&self) -> usize1245 fn num_classes(&self) -> usize { 1246 self.next_class as usize 1247 } 1248 1249 /// Scan and copy the input bytes to the output buffer quickly. 1250 /// 1251 /// This assumes that the current state of the DFA is either `InField` or 1252 /// `InQuotedField`. In this case, all bytes corresponding to the first 1253 /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are 1254 /// guaranteed to never result in a state transition out of the current 1255 /// state. This function takes advantage of that copies every byte from 1256 /// `input` in the first equivalence class to `output`. Once a byte is seen 1257 /// outside the first equivalence class, we quit and should fall back to 1258 /// the main DFA loop. 1259 #[inline(always)] scan_and_copy( &self, input: &[u8], nin: &mut usize, output: &mut [u8], nout: &mut usize, )1260 fn scan_and_copy( 1261 &self, 1262 input: &[u8], 1263 nin: &mut usize, 1264 output: &mut [u8], 1265 nout: &mut usize, 1266 ) { 1267 while *nin < input.len() 1268 && *nout < output.len() 1269 && self.classes[input[*nin] as usize] == 0 1270 { 1271 output[*nout] = input[*nin]; 1272 *nin += 1; 1273 *nout += 1; 1274 } 1275 } 1276 } 1277 1278 /// A single DFA state. 1279 /// 1280 /// A DFA state is represented by the starting index of its corresponding row 1281 /// in the DFA transition table. 
This representation allows us to elide a 1282 /// single multiplication instruction when computing the next transition for 1283 /// a particular input byte. 1284 #[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] 1285 struct DfaState(u8); 1286 1287 impl DfaState { start() -> DfaState1288 fn start() -> DfaState { 1289 DfaState(0) 1290 } 1291 is_start(&self) -> bool1292 fn is_start(&self) -> bool { 1293 self.0 == 0 1294 } 1295 } 1296 1297 impl fmt::Debug for Dfa { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1298 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 1299 write!(f, "Dfa(N/A)") 1300 } 1301 } 1302 1303 impl fmt::Debug for DfaClasses { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1304 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 1305 write!( 1306 f, 1307 "DfaClasses {{ classes: N/A, next_class: {:?} }}", 1308 self.next_class 1309 ) 1310 } 1311 } 1312 1313 impl Clone for Dfa { clone(&self) -> Dfa1314 fn clone(&self) -> Dfa { 1315 let mut dfa = Dfa::new(); 1316 dfa.trans.copy_from_slice(&self.trans); 1317 dfa 1318 } 1319 } 1320 1321 impl Clone for DfaClasses { clone(&self) -> DfaClasses1322 fn clone(&self) -> DfaClasses { 1323 let mut x = DfaClasses::new(); 1324 x.classes.copy_from_slice(&self.classes); 1325 x 1326 } 1327 } 1328 1329 #[cfg(test)] 1330 mod tests { 1331 use core::str; 1332 1333 use arrayvec::{ArrayString, ArrayVec}; 1334 1335 use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator}; 1336 1337 type Csv = ArrayVec<[Row; 10]>; 1338 type Row = ArrayVec<[Field; 10]>; 1339 type Field = ArrayString<[u8; 10]>; 1340 1341 // OMG I HATE BYTE STRING LITERALS SO MUCH. b(s: &str) -> &[u8]1342 fn b(s: &str) -> &[u8] { 1343 s.as_bytes() 1344 } 1345 1346 macro_rules! 
csv { 1347 ($([$($field:expr),*]),*) => {{ 1348 #[allow(unused_mut)] 1349 fn x() -> Csv { 1350 let mut csv = Csv::new(); 1351 $( 1352 let mut row = Row::new(); 1353 $( 1354 row.push(Field::from($field).unwrap()); 1355 )* 1356 csv.push(row); 1357 )* 1358 csv 1359 } 1360 x() 1361 }} 1362 } 1363 1364 macro_rules! parses_to { 1365 ($name:ident, $data:expr, $expected:expr) => { 1366 parses_to!($name, $data, $expected, |builder| builder); 1367 }; 1368 ($name:ident, $data:expr, $expected:expr, $config:expr) => { 1369 #[test] 1370 fn $name() { 1371 let mut builder = ReaderBuilder::new(); 1372 builder.nfa(true); 1373 $config(&mut builder); 1374 let mut rdr = builder.build(); 1375 let got = parse_by_field(&mut rdr, $data); 1376 let expected = $expected; 1377 assert_eq!(expected, got, "nfa by field"); 1378 1379 let mut builder = ReaderBuilder::new(); 1380 builder.nfa(true); 1381 $config(&mut builder); 1382 let mut rdr = builder.build(); 1383 let got = parse_by_record(&mut rdr, $data); 1384 let expected = $expected; 1385 assert_eq!(expected, got, "nfa by record"); 1386 1387 let mut builder = ReaderBuilder::new(); 1388 $config(&mut builder); 1389 let mut rdr = builder.build(); 1390 let got = parse_by_field(&mut rdr, $data); 1391 let expected = $expected; 1392 assert_eq!(expected, got, "dfa by field"); 1393 1394 let mut builder = ReaderBuilder::new(); 1395 $config(&mut builder); 1396 let mut rdr = builder.build(); 1397 let got = parse_by_record(&mut rdr, $data); 1398 let expected = $expected; 1399 assert_eq!(expected, got, "dfa by record"); 1400 } 1401 }; 1402 } 1403 parse_by_field(rdr: &mut Reader, data: &str) -> Csv1404 fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv { 1405 let mut data = data.as_bytes(); 1406 let mut field = [0u8; 10]; 1407 let mut csv = Csv::new(); 1408 let mut row = Row::new(); 1409 let mut outpos = 0; 1410 loop { 1411 let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]); 1412 data = &data[nin..]; 1413 outpos += nout; 1414 1415 match res 
{ 1416 ReadFieldResult::InputEmpty => { 1417 if !data.is_empty() { 1418 panic!("missing input data") 1419 } 1420 } 1421 ReadFieldResult::OutputFull => panic!("field too large"), 1422 ReadFieldResult::Field { record_end } => { 1423 let s = str::from_utf8(&field[..outpos]).unwrap(); 1424 row.push(Field::from(s).unwrap()); 1425 outpos = 0; 1426 if record_end { 1427 csv.push(row); 1428 row = Row::new(); 1429 } 1430 } 1431 ReadFieldResult::End => { 1432 return csv; 1433 } 1434 } 1435 } 1436 } 1437 parse_by_record(rdr: &mut Reader, data: &str) -> Csv1438 fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv { 1439 use crate::ReadRecordResult::*; 1440 1441 let mut data = data.as_bytes(); 1442 let mut record = [0; 1024]; 1443 let mut ends = [0; 10]; 1444 1445 let mut csv = Csv::new(); 1446 let (mut outpos, mut endpos) = (0, 0); 1447 loop { 1448 let (res, nin, nout, nend) = rdr.read_record( 1449 data, 1450 &mut record[outpos..], 1451 &mut ends[endpos..], 1452 ); 1453 data = &data[nin..]; 1454 outpos += nout; 1455 endpos += nend; 1456 1457 match res { 1458 InputEmpty => { 1459 if !data.is_empty() { 1460 panic!("missing input data") 1461 } 1462 } 1463 OutputFull => panic!("record too large (out buffer)"), 1464 OutputEndsFull => panic!("record too large (end buffer)"), 1465 Record => { 1466 let s = str::from_utf8(&record[..outpos]).unwrap(); 1467 let mut start = 0; 1468 let mut row = Row::new(); 1469 for &end in &ends[..endpos] { 1470 row.push(Field::from(&s[start..end]).unwrap()); 1471 start = end; 1472 } 1473 csv.push(row); 1474 outpos = 0; 1475 endpos = 0; 1476 } 1477 End => return csv, 1478 } 1479 } 1480 } 1481 1482 parses_to!(one_row_one_field, "a", csv![["a"]]); 1483 parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]); 1484 parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]); 1485 parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]); 1486 parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]); 1487 
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);

    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma,
        "a,b,\nx,y,",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_lf,
        "a,b,c\nx,y,z\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_lf,
        "a,b,\nx,y,\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_crlf,
        "a,b,c\r\nx,y,z\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_crlf,
        "a,b,\r\nx,y,\r\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_cr,
        "a,b,c\rx,y,z\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_cr,
        "a,b,\rx,y,\r",
        csv![["a", "b", ""], ["x", "y", ""]]
    );

    // Leading/trailing empty lines never produce records.
    parses_to!(
        trailing_lines_no_record,
        "\n\n\na,b,c\nx,y,z\n\n\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_cr,
        "\r\r\ra,b,c\rx,y,z\r\r\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_crlf,
        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );

    parses_to!(empty, "", csv![]);
    parses_to!(empty_lines, "\n\n\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed,
        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
    parses_to!(
        empty_lines_interspersed_crlf,
        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed_mixed,
        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
    parses_to!(
        empty_lines_interspersed_cr,
        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );

    parses_to!(
        term_weird,
        "zza,bzc,dzz",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.terminator(Terminator::Any(b'z'));
        }
    );

    parses_to!(
        ascii_delimited,
        "a\x1fb\x1ec\x1fd",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.ascii();
        }
    );

    // A UTF-8 BOM is not stripped; it is data like anything else.
    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);

    parses_to!(quote_empty, "\"\"", csv![[""]]);
    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
    parses_to!(quote_space, "\" \"", csv![[" "]]);
    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
    parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]);

    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
        b.quote(b'z');
    });

    // This one is pretty hokey.
    // I don't really know what the "right" behavior is.
    parses_to!(
        quote_delimiter,
        ",a,,b",
        csv![["a,b"]],
        |b: &mut ReaderBuilder| {
            b.quote(b',');
        }
    );

    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
    parses_to!(
        quote_escapes_no_double,
        r#""a""b""#,
        csv![[r#"a"b""#]],
        |b: &mut ReaderBuilder| {
            b.double_quote(false);
        }
    );
    parses_to!(
        quote_escapes,
        r#""a\"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\'));
        }
    );
    parses_to!(
        quote_escapes_change,
        r#""az"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'z'));
        }
    );

    parses_to!(
        quote_escapes_with_comma,
        r#""\"A,B\"""#,
        csv![[r#""A,B""#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\')).double_quote(false);
        }
    );

    parses_to!(
        quoting_disabled,
        r#""abc,foo""#,
        csv![[r#""abc"#, r#"foo""#]],
        |b: &mut ReaderBuilder| {
            b.quoting(false);
        }
    );

    parses_to!(
        delimiter_tabs,
        "a\tb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'\t');
        }
    );
    parses_to!(
        delimiter_weird,
        "azb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'z');
        }
    );

    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);

    // Comments are only recognized at the start of a record.
    parses_to!(
        comment_1,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_2,
        "foo\n # hi\nbar\n",
        csv![["foo"], [" # hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_3,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["# hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'\n'));
        }
    );
    parses_to!(
        comment_4,
        "foo,b#ar,baz",
        csv![["foo", "b#ar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_5,
        "foo,#bar,baz",
        csv![["foo", "#bar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );

    // Asserts one `read_field` call's full result triple.
    macro_rules! assert_read {
        (
            $rdr:expr, $input:expr, $output:expr,
            $expect_in:expr, $expect_out:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout) = $rdr.read_field($input, $output);
            assert_eq!($expect_in, nin);
            assert_eq!($expect_out, nout);
            assert_eq!($expect_res, res);
        }};
    }

    // This tests that feeding a new reader with an empty buffer sends us
    // straight to End.
    #[test]
    fn stream_empty() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, &[], &mut [], 0, 0, End);
    }

    // Test that a single space is treated as a single field.
    #[test]
    fn stream_space() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }

    // Test that a single comma ...
    #[test]
    fn stream_comma() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }

    // Test that we can read a single large field in multiple output
    // buffers.
    #[test]
    fn stream_output_chunks() {
        use crate::ReadFieldResult::*;

        let mut inp = b("fooquux");
        let out = &mut [0; 2];
        let mut rdr = Reader::new();

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("fo"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("oq"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("uu"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 1, 1, InputEmpty);
        assert_eq!(&out[..1], b("x"));
        inp = &inp[1..];
        assert!(inp.is_empty());

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, inp, out, 0, 0, End);
    }

    // Test that we can read a single large field across multiple input
    // buffers.
    #[test]
    fn stream_input_chunks() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fooq"));

        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
        assert_eq!(&out[..6], b("fooquu"));

        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
        assert_eq!(&out[..7], b("fooquux"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test we can read doubled quotes correctly in a stream.
    #[test]
    fn stream_doubled_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        // The doubled quote straddles the two input chunks.
        assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test we can read escaped quotes correctly in a stream.
    #[test]
    fn stream_escaped_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut builder = ReaderBuilder::new();
        let mut rdr = builder.escape(Some(b'\\')).build();

        // The escape sequence straddles the two input chunks.
        assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test that empty output buffers don't wreak havoc.
    #[test]
    fn stream_empty_output() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(
            rdr,
            b("foo,bar"),
            out,
            4,
            3,
            Field { record_end: false }
        );
        assert_eq!(&out[..3], b("foo"));

        assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);

        assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
        assert_eq!(&out[..3], b("bar"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test that we can reset the parser mid-stream and count on it to do
    // the right thing.
    #[test]
    fn reset_works() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
        assert_eq!(&out[..3], b("foo"));

        // Without reseting the parser state, the reader will remember that
        // we're in a quoted field, and therefore interpret the leading double
        // quotes below as a single quote and the trailing quote as a matching
        // terminator. With the reset, however, the parser forgets the quoted
        // field and treats the leading double quotes as a syntax quirk and
        // drops them, in addition to hanging on to the trailing unmatched
        // quote. (Matches Python's behavior.)
        rdr.reset();

        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
        assert_eq!(&out[..4], b("bar\""));
    }

    // Test the line number reporting is correct.
    #[test]
    fn line_numbers() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_eq!(1, rdr.line());

        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
        assert_eq!(5, rdr.line());

        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
        assert_eq!(5, rdr.line());

        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
        assert_eq!(6, rdr.line());

        assert_read!(rdr, &[], &mut [0], 0, 0, End);
        assert_eq!(6, rdr.line());
    }

    // Asserts one `read_record` call's full result quadruple.
    macro_rules!
assert_read_record { 1943 ( 1944 $rdr:expr, $input:expr, $output:expr, $ends:expr, 1945 $expect_in:expr, $expect_out:expr, 1946 $expect_end:expr, $expect_res:expr 1947 ) => {{ 1948 let (res, nin, nout, nend) = 1949 $rdr.read_record($input, $output, $ends); 1950 assert_eq!($expect_res, res, "result"); 1951 assert_eq!($expect_in, nin, "input"); 1952 assert_eq!($expect_out, nout, "output"); 1953 assert_eq!($expect_end, nend, "ends"); 1954 }}; 1955 } 1956 1957 // Test that we can incrementally read a record. 1958 #[test] stream_record()1959 fn stream_record() { 1960 use crate::ReadRecordResult::*; 1961 1962 let mut inp = b("foo,bar\nbaz"); 1963 let out = &mut [0; 1024]; 1964 let ends = &mut [0; 10]; 1965 let mut rdr = Reader::new(); 1966 1967 assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); 1968 assert_eq!(ends[0], 3); 1969 assert_eq!(ends[1], 6); 1970 inp = &inp[8..]; 1971 1972 assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty); 1973 inp = &inp[3..]; 1974 1975 assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record); 1976 assert_eq!(ends[0], 3); 1977 1978 assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End); 1979 } 1980 1981 // Test that if our output ends are full during the last read that 1982 // we get an appropriate state returned. 
1983 #[test] stream_record_last_end_output_full()1984 fn stream_record_last_end_output_full() { 1985 use crate::ReadRecordResult::*; 1986 1987 let mut inp = b("foo,bar\nbaz"); 1988 let out = &mut [0; 1024]; 1989 let ends = &mut [0; 10]; 1990 let mut rdr = Reader::new(); 1991 1992 assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); 1993 assert_eq!(ends[0], 3); 1994 assert_eq!(ends[1], 6); 1995 inp = &inp[8..]; 1996 1997 assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty); 1998 inp = &inp[3..]; 1999 2000 assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull); 2001 assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record); 2002 assert_eq!(ends[0], 3); 2003 2004 assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End); 2005 } 2006 2007 #[test] reset_input_partial()2008 fn reset_input_partial() { 2009 use crate::ReadRecordResult::*; 2010 2011 let inp = b("foo,bar\nbaz"); 2012 let out = &mut [0; 1024]; 2013 let ends = &mut [0; 10]; 2014 let mut rdr = Reader::new(); 2015 2016 assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); 2017 2018 // Try to read incomplete record. 2019 let (result, _, _, _) = rdr.read_record(&inp[8..], out, ends); 2020 assert_eq!(result, InputEmpty); 2021 2022 rdr.reset(); 2023 2024 let inp = b("baz,raz\n"); 2025 let (result, _, _, _) = rdr.read_record(inp, out, ends); 2026 assert_eq!(result, Record); 2027 assert_eq!(ends[0], 3); 2028 } 2029 } 2030