1 use std::cmp; 2 use std::fmt; 3 use std::iter::FromIterator; 4 use std::ops::{self, Range}; 5 use std::result; 6 7 use bstr::{BString, ByteSlice}; 8 use serde::de::Deserialize; 9 10 use crate::deserializer::deserialize_byte_record; 11 use crate::error::{new_utf8_error, Result, Utf8Error}; 12 use crate::string_record::StringRecord; 13 14 /// A single CSV record stored as raw bytes. 15 /// 16 /// A byte record permits reading or writing CSV rows that are not UTF-8. 17 /// In general, you should prefer using a 18 /// [`StringRecord`](struct.StringRecord.html) 19 /// since it is more ergonomic, but a `ByteRecord` is provided in case you need 20 /// it. 21 /// 22 /// If you are using the Serde (de)serialization APIs, then you probably never 23 /// need to interact with a `ByteRecord` or a `StringRecord`. However, there 24 /// are some circumstances in which you might need to use a raw record type 25 /// while still using Serde. For example, if you need to deserialize possibly 26 /// invalid UTF-8 fields, then you'll need to first read your record into a 27 /// `ByteRecord`, and then use `ByteRecord::deserialize` to run Serde. Another 28 /// reason for using the raw record deserialization APIs is if you're using 29 /// Serde to read into borrowed data such as a `&'a str` or a `&'a [u8]`. 30 /// 31 /// Two `ByteRecord`s are compared on the basis of their field data. Any 32 /// position information associated with the records is ignored. 33 #[derive(Clone, Eq)] 34 pub struct ByteRecord(Box<ByteRecordInner>); 35 36 impl PartialEq for ByteRecord { eq(&self, other: &ByteRecord) -> bool37 fn eq(&self, other: &ByteRecord) -> bool { 38 if self.len() != other.len() { 39 return false; 40 } 41 self.iter().zip(other.iter()).all(|e| e.0 == e.1) 42 } 43 } 44 45 impl<T: AsRef<[u8]>> PartialEq<Vec<T>> for ByteRecord { eq(&self, other: &Vec<T>) -> bool46 fn eq(&self, other: &Vec<T>) -> bool { 47 self.iter_eq(other) 48 } 49 } 50 51 impl<'a, T: AsRef<[u8]>> PartialEq<Vec<T>> for &'a ByteRecord { eq(&self, other: &Vec<T>) -> bool52 fn eq(&self, other: &Vec<T>) -> bool { 53 self.iter_eq(other) 54 } 55 } 56 57 impl<T: AsRef<[u8]>> PartialEq<[T]> for ByteRecord { eq(&self, other: &[T]) -> bool58 fn eq(&self, other: &[T]) -> bool { 59 self.iter_eq(other) 60 } 61 } 62 63 impl<'a, T: AsRef<[u8]>> PartialEq<[T]> for &'a ByteRecord { eq(&self, other: &[T]) -> bool64 fn eq(&self, other: &[T]) -> bool { 65 self.iter_eq(other) 66 } 67 } 68 69 impl fmt::Debug for ByteRecord { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result70 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 71 let mut fields = vec![]; 72 for field in self { 73 fields.push(BString::from(field.to_vec())); 74 } 75 write!(f, "ByteRecord({:?})", fields) 76 } 77 } 78 79 /// The inner portion of a byte record. 80 /// 81 /// We use this memory layout so that moving a `ByteRecord` only requires 82 /// moving a single pointer. The optimization is dubious at best, but does 83 /// seem to result in slightly better numbers in microbenchmarks. Methinks this 84 /// may heavily depend on the underlying allocator. 85 #[derive(Clone, Debug, Eq, PartialEq)] 86 struct ByteRecordInner { 87 /// The position of this byte record. 88 pos: Option<Position>, 89 /// All fields in this record, stored contiguously. 90 fields: Vec<u8>, 91 /// The number of and location of each field in this record. 92 bounds: Bounds, 93 } 94 95 impl Default for ByteRecord { 96 #[inline] default() -> ByteRecord97 fn default() -> ByteRecord { 98 ByteRecord::new() 99 } 100 } 101 102 impl ByteRecord { 103 /// Create a new empty `ByteRecord`. 104 /// 105 /// Note that you may find the `ByteRecord::from` constructor more 106 /// convenient, which is provided by an impl on the `From` trait. 107 /// 108 /// # Example: create an empty record 109 /// 110 /// ``` 111 /// use csv::ByteRecord; 112 /// 113 /// let record = ByteRecord::new(); 114 /// assert_eq!(record.len(), 0); 115 /// ``` 116 /// 117 /// # Example: initialize a record from a `Vec` 118 /// 119 /// ``` 120 /// use csv::ByteRecord; 121 /// 122 /// let record = ByteRecord::from(vec!["a", "b", "c"]); 123 /// assert_eq!(record.len(), 3); 124 /// ``` 125 #[inline] new() -> ByteRecord126 pub fn new() -> ByteRecord { 127 ByteRecord::with_capacity(0, 0) 128 } 129 130 /// Create a new empty `ByteRecord` with the given capacity settings. 131 /// 132 /// `buffer` refers to the capacity of the buffer used to store the 133 /// actual row contents. `fields` refers to the number of fields one 134 /// might expect to store. 135 #[inline] with_capacity(buffer: usize, fields: usize) -> ByteRecord136 pub fn with_capacity(buffer: usize, fields: usize) -> ByteRecord { 137 ByteRecord(Box::new(ByteRecordInner { 138 pos: None, 139 fields: vec![0; buffer], 140 bounds: Bounds::with_capacity(fields), 141 })) 142 } 143 144 /// Deserialize this record. 145 /// 146 /// The `D` type parameter refers to the type that this record should be 147 /// deserialized into. The `'de` lifetime refers to the lifetime of the 148 /// `ByteRecord`. The `'de` lifetime permits deserializing into structs 149 /// that borrow field data from this record. 150 /// 151 /// An optional `headers` parameter permits deserializing into a struct 152 /// based on its field names (corresponding to header values) rather than 153 /// the order in which the fields are defined. 154 /// 155 /// # Example: without headers 156 /// 157 /// This shows how to deserialize a single row into a struct based on the 158 /// order in which fields occur. This example also shows how to borrow 159 /// fields from the `ByteRecord`, which results in zero allocation 160 /// deserialization. 161 /// 162 /// ``` 163 /// use std::error::Error; 164 /// 165 /// use csv::ByteRecord; 166 /// use serde::Deserialize; 167 /// 168 /// #[derive(Deserialize)] 169 /// struct Row<'a> { 170 /// city: &'a str, 171 /// country: &'a str, 172 /// population: u64, 173 /// } 174 /// 175 /// # fn main() { example().unwrap() } 176 /// fn example() -> Result<(), Box<dyn Error>> { 177 /// let record = ByteRecord::from(vec![ 178 /// "Boston", "United States", "4628910", 179 /// ]); 180 /// 181 /// let row: Row = record.deserialize(None)?; 182 /// assert_eq!(row.city, "Boston"); 183 /// assert_eq!(row.country, "United States"); 184 /// assert_eq!(row.population, 4628910); 185 /// Ok(()) 186 /// } 187 /// ``` 188 /// 189 /// # Example: with headers 190 /// 191 /// This example is like the previous one, but shows how to deserialize 192 /// into a struct based on the struct's field names. For this to work, 193 /// you must provide a header row. 194 /// 195 /// This example also shows that you can deserialize into owned data 196 /// types (e.g., `String`) instead of borrowed data types (e.g., `&str`). 197 /// 198 /// ``` 199 /// use std::error::Error; 200 /// 201 /// use csv::ByteRecord; 202 /// use serde::Deserialize; 203 /// 204 /// #[derive(Deserialize)] 205 /// struct Row { 206 /// city: String, 207 /// country: String, 208 /// population: u64, 209 /// } 210 /// 211 /// # fn main() { example().unwrap() } 212 /// fn example() -> Result<(), Box<dyn Error>> { 213 /// // Notice that the fields are not in the same order 214 /// // as the fields in the struct! 215 /// let header = ByteRecord::from(vec![ 216 /// "country", "city", "population", 217 /// ]); 218 /// let record = ByteRecord::from(vec![ 219 /// "United States", "Boston", "4628910", 220 /// ]); 221 /// 222 /// let row: Row = record.deserialize(Some(&header))?; 223 /// assert_eq!(row.city, "Boston"); 224 /// assert_eq!(row.country, "United States"); 225 /// assert_eq!(row.population, 4628910); 226 /// Ok(()) 227 /// } 228 /// ``` deserialize<'de, D: Deserialize<'de>>( &'de self, headers: Option<&'de ByteRecord>, ) -> Result<D>229 pub fn deserialize<'de, D: Deserialize<'de>>( 230 &'de self, 231 headers: Option<&'de ByteRecord>, 232 ) -> Result<D> { 233 deserialize_byte_record(self, headers) 234 } 235 236 /// Returns an iterator over all fields in this record. 237 /// 238 /// # Example 239 /// 240 /// This example shows how to iterate over each field in a `ByteRecord`. 241 /// 242 /// ``` 243 /// use csv::ByteRecord; 244 /// 245 /// let record = ByteRecord::from(vec!["a", "b", "c"]); 246 /// for field in record.iter() { 247 /// assert!(field == b"a" || field == b"b" || field == b"c"); 248 /// } 249 /// ``` 250 #[inline] iter(&self) -> ByteRecordIter251 pub fn iter(&self) -> ByteRecordIter { 252 self.into_iter() 253 } 254 255 /// Return the field at index `i`. 256 /// 257 /// If no field at index `i` exists, then this returns `None`. 258 /// 259 /// # Example 260 /// 261 /// ``` 262 /// use csv::ByteRecord; 263 /// 264 /// let record = ByteRecord::from(vec!["a", "b", "c"]); 265 /// assert_eq!(record.get(1), Some(&b"b"[..])); 266 /// assert_eq!(record.get(3), None); 267 /// ``` 268 #[inline] get(&self, i: usize) -> Option<&[u8]>269 pub fn get(&self, i: usize) -> Option<&[u8]> { 270 self.0.bounds.get(i).map(|range| &self.0.fields[range]) 271 } 272 273 /// Returns true if and only if this record is empty. 274 /// 275 /// # Example 276 /// 277 /// ``` 278 /// use csv::ByteRecord; 279 /// 280 /// assert!(ByteRecord::new().is_empty()); 281 /// ``` 282 #[inline] is_empty(&self) -> bool283 pub fn is_empty(&self) -> bool { 284 self.len() == 0 285 } 286 287 /// Returns the number of fields in this record. 288 /// 289 /// # Example 290 /// 291 /// ``` 292 /// use csv::ByteRecord; 293 /// 294 /// let record = ByteRecord::from(vec!["a", "b", "c"]); 295 /// assert_eq!(record.len(), 3); 296 /// ``` 297 #[inline] len(&self) -> usize298 pub fn len(&self) -> usize { 299 self.0.bounds.len() 300 } 301 302 /// Truncate this record to `n` fields. 303 /// 304 /// If `n` is greater than the number of fields in this record, then this 305 /// has no effect. 306 /// 307 /// # Example 308 /// 309 /// ``` 310 /// use csv::ByteRecord; 311 /// 312 /// let mut record = ByteRecord::from(vec!["a", "b", "c"]); 313 /// assert_eq!(record.len(), 3); 314 /// record.truncate(1); 315 /// assert_eq!(record.len(), 1); 316 /// assert_eq!(record, vec!["a"]); 317 /// ``` 318 #[inline] truncate(&mut self, n: usize)319 pub fn truncate(&mut self, n: usize) { 320 if n <= self.len() { 321 self.0.bounds.len = n; 322 } 323 } 324 325 /// Clear this record so that it has zero fields. 326 /// 327 /// This is equivalent to calling `truncate(0)`. 328 /// 329 /// Note that it is not necessary to clear the record to reuse it with 330 /// the CSV reader. 331 /// 332 /// # Example 333 /// 334 /// ``` 335 /// use csv::ByteRecord; 336 /// 337 /// let mut record = ByteRecord::from(vec!["a", "b", "c"]); 338 /// assert_eq!(record.len(), 3); 339 /// record.clear(); 340 /// assert_eq!(record.len(), 0); 341 /// ``` 342 #[inline] clear(&mut self)343 pub fn clear(&mut self) { 344 self.truncate(0); 345 } 346 347 /// Trim the fields of this record so that leading and trailing whitespace 348 /// is removed. 349 /// 350 /// This method uses the ASCII definition of whitespace. That is, only 351 /// bytes in the class `[\t\n\v\f\r ]` are trimmed. 352 /// 353 /// # Example 354 /// 355 /// ``` 356 /// use csv::ByteRecord; 357 /// 358 /// let mut record = ByteRecord::from(vec![ 359 /// " ", "\tfoo", "bar ", "b a z", 360 /// ]); 361 /// record.trim(); 362 /// assert_eq!(record, vec!["", "foo", "bar", "b a z"]); 363 /// ``` trim(&mut self)364 pub fn trim(&mut self) { 365 let length = self.len(); 366 if length == 0 { 367 return; 368 } 369 // TODO: We could likely do this in place, but for now, we allocate. 370 let mut trimmed = 371 ByteRecord::with_capacity(self.as_slice().len(), self.len()); 372 trimmed.set_position(self.position().cloned()); 373 for field in &*self { 374 trimmed.push_field(field.trim()); 375 } 376 *self = trimmed; 377 } 378 379 /// Add a new field to this record. 380 /// 381 /// # Example 382 /// 383 /// ``` 384 /// use csv::ByteRecord; 385 /// 386 /// let mut record = ByteRecord::new(); 387 /// record.push_field(b"foo"); 388 /// assert_eq!(&record[0], b"foo"); 389 /// ``` 390 #[inline] push_field(&mut self, field: &[u8])391 pub fn push_field(&mut self, field: &[u8]) { 392 let (s, e) = (self.0.bounds.end(), self.0.bounds.end() + field.len()); 393 while e > self.0.fields.len() { 394 self.expand_fields(); 395 } 396 self.0.fields[s..e].copy_from_slice(field); 397 self.0.bounds.add(e); 398 } 399 400 /// Return the position of this record, if available. 401 /// 402 /// # Example 403 /// 404 /// ``` 405 /// use std::error::Error; 406 /// 407 /// use csv::{ByteRecord, ReaderBuilder}; 408 /// 409 /// # fn main() { example().unwrap(); } 410 /// fn example() -> Result<(), Box<dyn Error>> { 411 /// let mut record = ByteRecord::new(); 412 /// let mut rdr = ReaderBuilder::new() 413 /// .has_headers(false) 414 /// .from_reader("a,b,c\nx,y,z".as_bytes()); 415 /// 416 /// assert!(rdr.read_byte_record(&mut record)?); 417 /// { 418 /// let pos = record.position().expect("a record position"); 419 /// assert_eq!(pos.byte(), 0); 420 /// assert_eq!(pos.line(), 1); 421 /// assert_eq!(pos.record(), 0); 422 /// } 423 /// 424 /// assert!(rdr.read_byte_record(&mut record)?); 425 /// { 426 /// let pos = record.position().expect("a record position"); 427 /// assert_eq!(pos.byte(), 6); 428 /// assert_eq!(pos.line(), 2); 429 /// assert_eq!(pos.record(), 1); 430 /// } 431 /// 432 /// // Finish the CSV reader for good measure. 433 /// assert!(!rdr.read_byte_record(&mut record)?); 434 /// Ok(()) 435 /// } 436 /// ``` 437 #[inline] position(&self) -> Option<&Position>438 pub fn position(&self) -> Option<&Position> { 439 self.0.pos.as_ref() 440 } 441 442 /// Set the position of this record. 443 /// 444 /// # Example 445 /// 446 /// ``` 447 /// use csv::{ByteRecord, Position}; 448 /// 449 /// let mut record = ByteRecord::from(vec!["a", "b", "c"]); 450 /// let mut pos = Position::new(); 451 /// pos.set_byte(100); 452 /// pos.set_line(4); 453 /// pos.set_record(2); 454 /// 455 /// record.set_position(Some(pos.clone())); 456 /// assert_eq!(record.position(), Some(&pos)); 457 /// ``` 458 #[inline] set_position(&mut self, pos: Option<Position>)459 pub fn set_position(&mut self, pos: Option<Position>) { 460 self.0.pos = pos; 461 } 462 463 /// Return the start and end position of a field in this record. 464 /// 465 /// If no such field exists at the given index, then return `None`. 466 /// 467 /// The range returned can be used with the slice returned by `as_slice`. 468 /// 469 /// # Example 470 /// 471 /// ``` 472 /// use csv::ByteRecord; 473 /// 474 /// let record = ByteRecord::from(vec!["foo", "quux", "z"]); 475 /// let range = record.range(1).expect("a record range"); 476 /// assert_eq!(&record.as_slice()[range], &b"quux"[..]); 477 /// ``` 478 #[inline] range(&self, i: usize) -> Option<Range<usize>>479 pub fn range(&self, i: usize) -> Option<Range<usize>> { 480 self.0.bounds.get(i) 481 } 482 483 /// Return the entire row as a single byte slice. The slice returned stores 484 /// all fields contiguously. The boundaries of each field can be determined 485 /// via the `range` method. 486 /// 487 /// # Example 488 /// 489 /// ``` 490 /// use csv::ByteRecord; 491 /// 492 /// let record = ByteRecord::from(vec!["foo", "quux", "z"]); 493 /// assert_eq!(record.as_slice(), &b"fooquuxz"[..]); 494 /// ``` 495 #[inline] as_slice(&self) -> &[u8]496 pub fn as_slice(&self) -> &[u8] { 497 &self.0.fields[..self.0.bounds.end()] 498 } 499 500 /// Clone this record, but only copy `fields` up to the end of bounds. This 501 /// is useful when one wants to copy a record, but not necessarily any 502 /// excess capacity in that record. 503 #[inline] clone_truncated(&self) -> ByteRecord504 pub(crate) fn clone_truncated(&self) -> ByteRecord { 505 let mut br = ByteRecord::new(); 506 br.0.pos = self.0.pos.clone(); 507 br.0.bounds = self.0.bounds.clone(); 508 br.0.fields = self.0.fields[..self.0.bounds.end()].to_vec(); 509 br 510 } 511 512 /// Retrieve the underlying parts of a byte record. 513 #[inline] as_parts(&mut self) -> (&mut Vec<u8>, &mut Vec<usize>)514 pub(crate) fn as_parts(&mut self) -> (&mut Vec<u8>, &mut Vec<usize>) { 515 let inner = &mut *self.0; 516 (&mut inner.fields, &mut inner.bounds.ends) 517 } 518 519 /// Set the number of fields in the given record record. 520 #[inline] set_len(&mut self, len: usize)521 pub(crate) fn set_len(&mut self, len: usize) { 522 self.0.bounds.len = len; 523 } 524 525 /// Expand the capacity for storing fields. 526 #[inline] expand_fields(&mut self)527 pub(crate) fn expand_fields(&mut self) { 528 let new_len = self.0.fields.len().checked_mul(2).unwrap(); 529 self.0.fields.resize(cmp::max(4, new_len), 0); 530 } 531 532 /// Expand the capacity for storing field ending positions. 533 #[inline] expand_ends(&mut self)534 pub(crate) fn expand_ends(&mut self) { 535 self.0.bounds.expand(); 536 } 537 538 /// Validate the given record as UTF-8. 539 /// 540 /// If it's not UTF-8, return an error. 541 #[inline] validate(&self) -> result::Result<(), Utf8Error>542 pub(crate) fn validate(&self) -> result::Result<(), Utf8Error> { 543 // If the entire buffer is ASCII, then we have nothing to fear. 544 if self.0.fields[..self.0.bounds.end()].is_ascii() { 545 return Ok(()); 546 } 547 // Otherwise, we must check each field individually to ensure that 548 // it's valid UTF-8. 549 for (i, field) in self.iter().enumerate() { 550 if let Err(err) = field.to_str() { 551 return Err(new_utf8_error(i, err.valid_up_to())); 552 } 553 } 554 Ok(()) 555 } 556 557 /// Compare the given byte record with the iterator of fields for equality. iter_eq<I, T>(&self, other: I) -> bool where I: IntoIterator<Item = T>, T: AsRef<[u8]>,558 pub(crate) fn iter_eq<I, T>(&self, other: I) -> bool 559 where 560 I: IntoIterator<Item = T>, 561 T: AsRef<[u8]>, 562 { 563 let mut it_record = self.iter(); 564 let mut it_other = other.into_iter(); 565 loop { 566 match (it_record.next(), it_other.next()) { 567 (None, None) => return true, 568 (None, Some(_)) | (Some(_), None) => return false, 569 (Some(x), Some(y)) => { 570 if x != y.as_ref() { 571 return false; 572 } 573 } 574 } 575 } 576 } 577 } 578 579 /// A position in CSV data. 580 /// 581 /// A position is used to report errors in CSV data. All positions include the 582 /// byte offset, line number and record index at which the error occurred. 583 /// 584 /// Byte offsets and record indices start at `0`. Line numbers start at `1`. 585 /// 586 /// A CSV reader will automatically assign the position of each record. 587 #[derive(Clone, Debug, Eq, PartialEq)] 588 pub struct Position { 589 byte: u64, 590 line: u64, 591 record: u64, 592 } 593 594 impl Position { 595 /// Returns a new position initialized to the start value. 596 #[inline] new() -> Position597 pub fn new() -> Position { 598 Position { byte: 0, line: 1, record: 0 } 599 } 600 601 /// The byte offset, starting at `0`, of this position. 602 #[inline] byte(&self) -> u64603 pub fn byte(&self) -> u64 { 604 self.byte 605 } 606 /// The line number, starting at `1`, of this position. 607 #[inline] line(&self) -> u64608 pub fn line(&self) -> u64 { 609 self.line 610 } 611 /// The record index, starting with the first record at `0`. 612 #[inline] record(&self) -> u64613 pub fn record(&self) -> u64 { 614 self.record 615 } 616 617 /// Set the byte offset of this position. 618 #[inline] set_byte(&mut self, byte: u64) -> &mut Position619 pub fn set_byte(&mut self, byte: u64) -> &mut Position { 620 self.byte = byte; 621 self 622 } 623 624 /// Set the line number of this position. 625 /// 626 /// If the line number is less than `1`, then this method panics. 627 #[inline] set_line(&mut self, line: u64) -> &mut Position628 pub fn set_line(&mut self, line: u64) -> &mut Position { 629 assert!(line > 0); 630 self.line = line; 631 self 632 } 633 634 /// Set the record index of this position. 635 #[inline] set_record(&mut self, record: u64) -> &mut Position636 pub fn set_record(&mut self, record: u64) -> &mut Position { 637 self.record = record; 638 self 639 } 640 } 641 642 /// The bounds of fields in a single record. 643 #[derive(Clone, Debug, Eq, PartialEq)] 644 struct Bounds { 645 /// The ending index of each field. 646 ends: Vec<usize>, 647 /// The number of fields in this record. 648 /// 649 /// Technically, we could drop this field and maintain an invariant that 650 /// `ends.len()` is always the number of fields, but doing that efficiently 651 /// requires attention to safety. We play it safe at essentially no cost. 652 len: usize, 653 } 654 655 impl Default for Bounds { 656 #[inline] default() -> Bounds657 fn default() -> Bounds { 658 Bounds::with_capacity(0) 659 } 660 } 661 662 impl Bounds { 663 /// Create a new set of bounds with the given capacity for storing the 664 /// ends of fields. 665 #[inline] with_capacity(capacity: usize) -> Bounds666 fn with_capacity(capacity: usize) -> Bounds { 667 Bounds { ends: vec![0; capacity], len: 0 } 668 } 669 670 /// Returns the bounds of field `i`. 671 #[inline] get(&self, i: usize) -> Option<Range<usize>>672 fn get(&self, i: usize) -> Option<Range<usize>> { 673 if i >= self.len { 674 return None; 675 } 676 let end = match self.ends.get(i) { 677 None => return None, 678 Some(&end) => end, 679 }; 680 let start = match i.checked_sub(1).and_then(|i| self.ends.get(i)) { 681 None => 0, 682 Some(&start) => start, 683 }; 684 Some(ops::Range { start: start, end: end }) 685 } 686 687 /// Returns a slice of ending positions of all fields. 688 #[inline] ends(&self) -> &[usize]689 fn ends(&self) -> &[usize] { 690 &self.ends[..self.len] 691 } 692 693 /// Return the last position of the last field. 694 /// 695 /// If there are no fields, this returns `0`. 696 #[inline] end(&self) -> usize697 fn end(&self) -> usize { 698 self.ends().last().map(|&i| i).unwrap_or(0) 699 } 700 701 /// Returns the number of fields in these bounds. 702 #[inline] len(&self) -> usize703 fn len(&self) -> usize { 704 self.len 705 } 706 707 /// Expand the capacity for storing field ending positions. 708 #[inline] expand(&mut self)709 fn expand(&mut self) { 710 let new_len = self.ends.len().checked_mul(2).unwrap(); 711 self.ends.resize(cmp::max(4, new_len), 0); 712 } 713 714 /// Add a new field with the given ending position. 715 #[inline] add(&mut self, pos: usize)716 fn add(&mut self, pos: usize) { 717 if self.len >= self.ends.len() { 718 self.expand(); 719 } 720 self.ends[self.len] = pos; 721 self.len += 1; 722 } 723 } 724 725 impl ops::Index<usize> for ByteRecord { 726 type Output = [u8]; 727 #[inline] index(&self, i: usize) -> &[u8]728 fn index(&self, i: usize) -> &[u8] { 729 self.get(i).unwrap() 730 } 731 } 732 733 impl From<StringRecord> for ByteRecord { 734 #[inline] from(record: StringRecord) -> ByteRecord735 fn from(record: StringRecord) -> ByteRecord { 736 record.into_byte_record() 737 } 738 } 739 740 impl<T: AsRef<[u8]>> From<Vec<T>> for ByteRecord { 741 #[inline] from(xs: Vec<T>) -> ByteRecord742 fn from(xs: Vec<T>) -> ByteRecord { 743 ByteRecord::from_iter(&xs) 744 } 745 } 746 747 impl<'a, T: AsRef<[u8]>> From<&'a [T]> for ByteRecord { 748 #[inline] from(xs: &'a [T]) -> ByteRecord749 fn from(xs: &'a [T]) -> ByteRecord { 750 ByteRecord::from_iter(xs) 751 } 752 } 753 754 impl<T: AsRef<[u8]>> FromIterator<T> for ByteRecord { 755 #[inline] from_iter<I: IntoIterator<Item = T>>(iter: I) -> ByteRecord756 fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> ByteRecord { 757 let mut record = ByteRecord::new(); 758 record.extend(iter); 759 record 760 } 761 } 762 763 impl<T: AsRef<[u8]>> Extend<T> for ByteRecord { 764 #[inline] extend<I: IntoIterator<Item = T>>(&mut self, iter: I)765 fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) { 766 for x in iter { 767 self.push_field(x.as_ref()); 768 } 769 } 770 } 771 772 /// A double-ended iterator over the fields in a byte record. 773 /// 774 /// The `'r` lifetime variable refers to the lifetime of the `ByteRecord` that 775 /// is being iterated over. 776 #[derive(Clone)] 777 pub struct ByteRecordIter<'r> { 778 /// The record we are iterating over. 779 r: &'r ByteRecord, 780 /// The starting index of the previous field. (For reverse iteration.) 781 last_start: usize, 782 /// The ending index of the previous field. (For forward iteration.) 783 last_end: usize, 784 /// The index of forward iteration. 785 i_forward: usize, 786 /// The index of reverse iteration. 787 i_reverse: usize, 788 } 789 790 impl<'r> IntoIterator for &'r ByteRecord { 791 type IntoIter = ByteRecordIter<'r>; 792 type Item = &'r [u8]; 793 794 #[inline] into_iter(self) -> ByteRecordIter<'r>795 fn into_iter(self) -> ByteRecordIter<'r> { 796 ByteRecordIter { 797 r: self, 798 last_start: self.as_slice().len(), 799 last_end: 0, 800 i_forward: 0, 801 i_reverse: self.len(), 802 } 803 } 804 } 805 806 impl<'r> ExactSizeIterator for ByteRecordIter<'r> {} 807 808 impl<'r> Iterator for ByteRecordIter<'r> { 809 type Item = &'r [u8]; 810 811 #[inline] next(&mut self) -> Option<&'r [u8]>812 fn next(&mut self) -> Option<&'r [u8]> { 813 if self.i_forward == self.i_reverse { 814 None 815 } else { 816 let start = self.last_end; 817 let end = self.r.0.bounds.ends()[self.i_forward]; 818 self.i_forward += 1; 819 self.last_end = end; 820 Some(&self.r.0.fields[start..end]) 821 } 822 } 823 824 #[inline] size_hint(&self) -> (usize, Option<usize>)825 fn size_hint(&self) -> (usize, Option<usize>) { 826 let x = self.i_reverse - self.i_forward; 827 (x, Some(x)) 828 } 829 830 #[inline] count(self) -> usize831 fn count(self) -> usize { 832 self.len() 833 } 834 } 835 836 impl<'r> DoubleEndedIterator for ByteRecordIter<'r> { 837 #[inline] next_back(&mut self) -> Option<&'r [u8]>838 fn next_back(&mut self) -> Option<&'r [u8]> { 839 if self.i_forward == self.i_reverse { 840 None 841 } else { 842 self.i_reverse -= 1; 843 let start = self 844 .i_reverse 845 .checked_sub(1) 846 .map(|i| self.r.0.bounds.ends()[i]) 847 .unwrap_or(0); 848 let end = self.last_start; 849 self.last_start = start; 850 Some(&self.r.0.fields[start..end]) 851 } 852 } 853 } 854 855 #[cfg(test)] 856 mod tests { 857 use crate::string_record::StringRecord; 858 859 use super::ByteRecord; 860 b(s: &str) -> &[u8]861 fn b(s: &str) -> &[u8] { 862 s.as_bytes() 863 } 864 865 #[test] record_1()866 fn record_1() { 867 let mut rec = ByteRecord::new(); 868 rec.push_field(b"foo"); 869 870 assert_eq!(rec.len(), 1); 871 assert_eq!(rec.get(0), Some(b("foo"))); 872 assert_eq!(rec.get(1), None); 873 assert_eq!(rec.get(2), None); 874 } 875 876 #[test] record_2()877 fn record_2() { 878 let mut rec = ByteRecord::new(); 879 rec.push_field(b"foo"); 880 rec.push_field(b"quux"); 881 882 assert_eq!(rec.len(), 2); 883 assert_eq!(rec.get(0), Some(b("foo"))); 884 assert_eq!(rec.get(1), Some(b("quux"))); 885 assert_eq!(rec.get(2), None); 886 assert_eq!(rec.get(3), None); 887 } 888 889 #[test] empty_record()890 fn empty_record() { 891 let rec = ByteRecord::new(); 892 893 assert_eq!(rec.len(), 0); 894 assert_eq!(rec.get(0), None); 895 assert_eq!(rec.get(1), None); 896 } 897 898 #[test] trim_whitespace_only()899 fn trim_whitespace_only() { 900 let mut rec = ByteRecord::from(vec![b" \t\n\r\x0c"]); 901 rec.trim(); 902 assert_eq!(rec.get(0), Some(b(""))); 903 } 904 905 #[test] trim_front()906 fn trim_front() { 907 let mut rec = ByteRecord::from(vec![b" abc"]); 908 rec.trim(); 909 assert_eq!(rec.get(0), Some(b("abc"))); 910 911 let mut rec = ByteRecord::from(vec![b(" abc"), b(" xyz")]); 912 rec.trim(); 913 assert_eq!(rec.get(0), Some(b("abc"))); 914 assert_eq!(rec.get(1), Some(b("xyz"))); 915 } 916 917 #[test] trim_back()918 fn trim_back() { 919 let mut rec = ByteRecord::from(vec![b"abc "]); 920 rec.trim(); 921 assert_eq!(rec.get(0), Some(b("abc"))); 922 923 let mut rec = ByteRecord::from(vec![b("abc "), b("xyz ")]); 924 rec.trim(); 925 assert_eq!(rec.get(0), Some(b("abc"))); 926 assert_eq!(rec.get(1), Some(b("xyz"))); 927 } 928 929 #[test] trim_both()930 fn trim_both() { 931 let mut rec = ByteRecord::from(vec![b" abc "]); 932 rec.trim(); 933 assert_eq!(rec.get(0), Some(b("abc"))); 934 935 let mut rec = ByteRecord::from(vec![b(" abc "), b(" xyz ")]); 936 rec.trim(); 937 assert_eq!(rec.get(0), Some(b("abc"))); 938 assert_eq!(rec.get(1), Some(b("xyz"))); 939 } 940 941 #[test] trim_does_not_panic_on_empty_records_1()942 fn trim_does_not_panic_on_empty_records_1() { 943 let mut rec = ByteRecord::from(vec![b""]); 944 rec.trim(); 945 assert_eq!(rec.get(0), Some(b(""))); 946 } 947 948 #[test] trim_does_not_panic_on_empty_records_2()949 fn trim_does_not_panic_on_empty_records_2() { 950 let mut rec = ByteRecord::from(vec![b"", b""]); 951 rec.trim(); 952 assert_eq!(rec.get(0), Some(b(""))); 953 assert_eq!(rec.get(1), Some(b(""))); 954 } 955 956 #[test] trim_does_not_panic_on_empty_records_3()957 fn trim_does_not_panic_on_empty_records_3() { 958 let mut rec = ByteRecord::new(); 959 rec.trim(); 960 assert_eq!(rec.as_slice().len(), 0); 961 } 962 963 #[test] empty_field_1()964 fn empty_field_1() { 965 let mut rec = ByteRecord::new(); 966 rec.push_field(b""); 967 968 assert_eq!(rec.len(), 1); 969 assert_eq!(rec.get(0), Some(b(""))); 970 assert_eq!(rec.get(1), None); 971 assert_eq!(rec.get(2), None); 972 } 973 974 #[test] empty_field_2()975 fn empty_field_2() { 976 let mut rec = ByteRecord::new(); 977 rec.push_field(b""); 978 rec.push_field(b""); 979 980 assert_eq!(rec.len(), 2); 981 assert_eq!(rec.get(0), Some(b(""))); 982 assert_eq!(rec.get(1), Some(b(""))); 983 assert_eq!(rec.get(2), None); 984 assert_eq!(rec.get(3), None); 985 } 986 987 #[test] empty_surround_1()988 fn empty_surround_1() { 989 let mut rec = ByteRecord::new(); 990 rec.push_field(b"foo"); 991 rec.push_field(b""); 992 rec.push_field(b"quux"); 993 994 assert_eq!(rec.len(), 3); 995 assert_eq!(rec.get(0), Some(b("foo"))); 996 assert_eq!(rec.get(1), Some(b(""))); 997 assert_eq!(rec.get(2), Some(b("quux"))); 998 assert_eq!(rec.get(3), None); 999 assert_eq!(rec.get(4), None); 1000 } 1001 1002 #[test] empty_surround_2()1003 fn empty_surround_2() { 1004 let mut rec = ByteRecord::new(); 1005 rec.push_field(b"foo"); 1006 rec.push_field(b""); 1007 rec.push_field(b"quux"); 1008 rec.push_field(b""); 1009 1010 assert_eq!(rec.len(), 4); 1011 assert_eq!(rec.get(0), Some(b("foo"))); 1012 assert_eq!(rec.get(1), Some(b(""))); 1013 assert_eq!(rec.get(2), Some(b("quux"))); 1014 assert_eq!(rec.get(3), Some(b(""))); 1015 assert_eq!(rec.get(4), None); 1016 assert_eq!(rec.get(5), None); 1017 } 1018 1019 #[test] utf8_error_1()1020 fn utf8_error_1() { 1021 let mut rec = ByteRecord::new(); 1022 rec.push_field(b"foo"); 1023 rec.push_field(b"b\xFFar"); 1024 1025 let err = StringRecord::from_byte_record(rec).unwrap_err(); 1026 assert_eq!(err.utf8_error().field(), 1); 1027 assert_eq!(err.utf8_error().valid_up_to(), 1); 1028 } 1029 1030 #[test] utf8_error_2()1031 fn utf8_error_2() { 1032 let mut rec = ByteRecord::new(); 1033 rec.push_field(b"\xFF"); 1034 1035 let err = StringRecord::from_byte_record(rec).unwrap_err(); 1036 assert_eq!(err.utf8_error().field(), 0); 1037 assert_eq!(err.utf8_error().valid_up_to(), 0); 1038 } 1039 1040 #[test] utf8_error_3()1041 fn utf8_error_3() { 1042 let mut rec = ByteRecord::new(); 1043 rec.push_field(b"a\xFF"); 1044 1045 let err = StringRecord::from_byte_record(rec).unwrap_err(); 1046 assert_eq!(err.utf8_error().field(), 0); 1047 assert_eq!(err.utf8_error().valid_up_to(), 1); 1048 } 1049 1050 #[test] utf8_error_4()1051 fn utf8_error_4() { 1052 let mut rec = ByteRecord::new(); 1053 rec.push_field(b"a"); 1054 rec.push_field(b"b"); 1055 rec.push_field(b"c"); 1056 rec.push_field(b"d"); 1057 rec.push_field(b"xyz\xFF"); 1058 1059 let err = StringRecord::from_byte_record(rec).unwrap_err(); 1060 assert_eq!(err.utf8_error().field(), 4); 1061 assert_eq!(err.utf8_error().valid_up_to(), 3); 1062 } 1063 1064 #[test] utf8_error_5()1065 fn utf8_error_5() { 1066 let mut rec = ByteRecord::new(); 1067 rec.push_field(b"a"); 1068 rec.push_field(b"b"); 1069 rec.push_field(b"c"); 1070 rec.push_field(b"d"); 1071 rec.push_field(b"\xFFxyz"); 1072 1073 let err = StringRecord::from_byte_record(rec).unwrap_err(); 1074 assert_eq!(err.utf8_error().field(), 4); 1075 assert_eq!(err.utf8_error().valid_up_to(), 0); 1076 } 1077 1078 // This tests a tricky case where a single field on its own isn't valid 1079 // UTF-8, but the concatenation of all fields is. 1080 #[test] utf8_error_6()1081 fn utf8_error_6() { 1082 let mut rec = ByteRecord::new(); 1083 rec.push_field(b"a\xc9"); 1084 rec.push_field(b"\x91b"); 1085 1086 let err = StringRecord::from_byte_record(rec).unwrap_err(); 1087 assert_eq!(err.utf8_error().field(), 0); 1088 assert_eq!(err.utf8_error().valid_up_to(), 1); 1089 } 1090 1091 // This tests that we can always clear a `ByteRecord` and get a guaranteed 1092 // successful conversion to UTF-8. This permits reusing the allocation. 1093 #[test] utf8_clear_ok()1094 fn utf8_clear_ok() { 1095 let mut rec = ByteRecord::new(); 1096 rec.push_field(b"\xFF"); 1097 assert!(StringRecord::from_byte_record(rec).is_err()); 1098 1099 let mut rec = ByteRecord::new(); 1100 rec.push_field(b"\xFF"); 1101 rec.clear(); 1102 assert!(StringRecord::from_byte_record(rec).is_ok()); 1103 } 1104 1105 #[test] iter()1106 fn iter() { 1107 let data = vec!["foo", "bar", "baz", "quux", "wat"]; 1108 let rec = ByteRecord::from(&*data); 1109 let got: Vec<&str> = 1110 rec.iter().map(|x| ::std::str::from_utf8(x).unwrap()).collect(); 1111 assert_eq!(data, got); 1112 } 1113 1114 #[test] iter_reverse()1115 fn iter_reverse() { 1116 let mut data = vec!["foo", "bar", "baz", "quux", "wat"]; 1117 let rec = ByteRecord::from(&*data); 1118 let got: Vec<&str> = rec 1119 .iter() 1120 .rev() 1121 .map(|x| ::std::str::from_utf8(x).unwrap()) 1122 .collect(); 1123 data.reverse(); 1124 assert_eq!(data, got); 1125 } 1126 1127 #[test] iter_forward_and_reverse()1128 fn iter_forward_and_reverse() { 1129 let data = vec!["foo", "bar", "baz", "quux", "wat"]; 1130 let rec = ByteRecord::from(data); 1131 let mut it = rec.iter(); 1132 1133 assert_eq!(it.next_back(), Some(b("wat"))); 1134 assert_eq!(it.next(), Some(b("foo"))); 1135 assert_eq!(it.next(), Some(b("bar"))); 1136 assert_eq!(it.next_back(), Some(b("quux"))); 1137 assert_eq!(it.next(), Some(b("baz"))); 1138 assert_eq!(it.next_back(), None); 1139 assert_eq!(it.next(), None); 1140 } 1141 1142 // Check that record equality respects field boundaries. 1143 // 1144 // Regression test for #138. 1145 #[test] eq_field_boundaries()1146 fn eq_field_boundaries() { 1147 let test1 = ByteRecord::from(vec!["12", "34"]); 1148 let test2 = ByteRecord::from(vec!["123", "4"]); 1149 1150 assert_ne!(test1, test2); 1151 } 1152 1153 // Check that record equality respects number of fields. 1154 // 1155 // Regression test for #138. 1156 #[test] eq_record_len()1157 fn eq_record_len() { 1158 let test1 = ByteRecord::from(vec!["12", "34", "56"]); 1159 let test2 = ByteRecord::from(vec!["12", "34"]); 1160 assert_ne!(test1, test2); 1161 } 1162 } 1163