// ciborium-ll/src/seg.rs

use super::*;

use ciborium_io::Read;

use core::marker::PhantomData;

/// A parser for incoming segments
///
/// An implementation converts the raw bytes of a segment into a borrowed
/// `Self::Item`. It may hold back some bytes between calls; the number of
/// bytes held back is reported by `Parser::saved()`.
pub trait Parser: Default {
    /// The type of item that is parsed
    ///
    /// This may be an unsized type (for example `[u8]` or `str`), since the
    /// item is only ever handed out by reference.
    type Item: ?Sized;

    /// The parsing error that may occur
    type Error;

    /// The main parsing function
    ///
    /// This function processes the incoming bytes and returns the item,
    /// borrowed from `bytes`.
    ///
    /// One important detail that **MUST NOT** be overlooked is that the
    /// parser may save data from a previous parsing attempt. The number of
    /// bytes saved is indicated by the `Parser::saved()` function. The saved
    /// bytes will be copied into the beginning of the `bytes` array before
    /// processing. Therefore, two requirements should be met.
    ///
    /// First, the incoming byte slice should be larger than the saved bytes.
    ///
    /// Second, the incoming byte slice should contain new bytes only after
    /// the saved byte prefix; whatever is in the prefix will be overwritten.
    ///
    /// If both criteria are met, this allows the parser to prepend its saved
    /// bytes without any additional allocation.
    fn parse<'a>(&mut self, bytes: &'a mut [u8]) -> Result<&'a Self::Item, Self::Error>;

    /// Indicates the number of saved bytes in the parser
    ///
    /// The default implementation returns `0`, which is correct for any
    /// parser that never holds bytes back between calls.
    fn saved(&self) -> usize {
        0
    }
}

/// A bytes parser
///
/// No actual processing is performed and the input bytes are directly
/// returned. This implies that this parser never saves any bytes internally.
///
/// The single private unit field keeps the tuple constructor private, so
/// outside of this module a value is obtained through `Default`.
#[derive(Default)]
pub struct Bytes(());

impl Parser for Bytes {
    type Item = [u8];
    type Error = core::convert::Infallible;

    /// Hands the input back unchanged as a shared slice.
    ///
    /// This cannot fail and keeps no state between calls, so the default
    /// `saved()` (always `0`) applies.
    fn parse<'a>(&mut self, bytes: &'a mut [u8]) -> Result<&'a [u8], Self::Error> {
        // Downgrade the exclusive borrow to a shared one for the same lifetime.
        let passthrough: &'a [u8] = bytes;
        Ok(passthrough)
    }
}

/// A text parser
///
/// This parser converts the input bytes to a `str`. This parser preserves
/// trailing invalid UTF-8 sequences in the case that chunking fell in the
/// middle of a valid UTF-8 character.
#[derive(Default)]
pub struct Text {
    /// Number of leading bytes of `buffer` that hold data carried over from
    /// a previous `parse` call; this is the value reported by `saved()`.
    stored: usize,
    /// Storage for the carried-over trailing bytes. `parse` rejects any
    /// undecodable tail longer than this buffer instead of storing it.
    buffer: [u8; 3],
}

impl Parser for Text {
    type Item = str;
    type Error = core::str::Utf8Error;

    /// Decodes `bytes` as UTF-8, carrying an incomplete trailing character
    /// over to the next call.
    ///
    /// The first `saved()` bytes of `bytes` are overwritten with the bytes
    /// held back by the previous call, so the caller must place new data
    /// only after that prefix. If `bytes` is not longer than the saved
    /// prefix, nothing can be decoded and `""` is returned with the saved
    /// state left untouched.
    ///
    /// On return, `saved()` reflects this call only: it is `0` when the whole
    /// slice decoded, or the length of the held-back tail otherwise.
    ///
    /// # Errors
    ///
    /// Returns the `Utf8Error` when the undecodable tail is longer than the
    /// internal 3-byte buffer, i.e. it cannot be a merely truncated character.
    fn parse<'a>(&mut self, bytes: &'a mut [u8]) -> Result<&'a str, Self::Error> {
        // If we cannot advance, return nothing.
        if bytes.len() <= self.stored {
            return Ok("");
        }

        // Copy previously held-back data into place.
        bytes[..self.stored].copy_from_slice(&self.buffer[..self.stored]);

        Ok(match core::str::from_utf8(bytes) {
            Ok(s) => {
                // Everything was consumed, including any carried-over bytes.
                // Without this reset `saved()` would keep reporting stale
                // bytes and the segment would never be seen as finished.
                self.stored = 0;
                s
            }
            Err(e) => {
                let valid_len = e.valid_up_to();
                let invalid_len = bytes.len() - valid_len;

                // If the undecodable tail is too long to be a truncated
                // character, we have a syntax error.
                if invalid_len > self.buffer.len() {
                    return Err(e);
                }

                // Otherwise, store the tail for the next read cycle.
                self.buffer[..invalid_len].copy_from_slice(&bytes[valid_len..]);
                self.stored = invalid_len;

                // Decode the valid part of the string.
                core::str::from_utf8(&bytes[..valid_len])
                    .expect("bytes before valid_up_to() are valid UTF-8")
            }
        })
    }

    /// Number of bytes held back by the most recent `parse` call.
    fn saved(&self) -> usize {
        self.stored
    }
}

/// A CBOR segment
///
/// This type represents a single bytes or text segment on the wire. It can be
/// read out in parsed chunks based on the size of the input scratch buffer.
pub struct Segment<'r, R: Read, P: Parser> {
    /// Decoder the segment's bytes are read from.
    reader: &'r mut Decoder<R>,
    /// Number of bytes of this segment that have not yet been read from
    /// `reader`.
    unread: usize,
    /// Offset reported in `Error::Syntax` for errors within this segment.
    offset: usize,
    /// Parser state, kept across `pull` calls so that held-back bytes can be
    /// prepended to the next chunk.
    parser: P,
}

impl<'r, R: Read, P: Parser> Segment<'r, R, P> {
    /// Gets the number of unprocessed bytes
    #[inline]
    pub fn left(&self) -> usize {
        self.unread + self.parser.saved()
    }

    /// Gets the next parsed chunk within the segment
    ///
    /// Returns `Ok(None)` when all chunks have been read.
    #[inline]
    pub fn pull<'a>(
        &mut self,
        buffer: &'a mut [u8],
    ) -> Result<Option<&'a P::Item>, Error<R::Error>> {
        use core::cmp::min;

        let prev = self.parser.saved();
        match self.unread {
            0 if prev == 0 => return Ok(None),
            0 => return Err(Error::Syntax(self.offset)),
            _ => (),
        }

        // Determine how many bytes to read.
        let size = min(buffer.len(), prev + self.unread);
        let full = &mut buffer[..size];
        let next = &mut full[min(size, prev)..];

        // Read additional bytes.
        self.reader.read_exact(next)?;
        self.unread -= next.len();

        self.parser
            .parse(full)
            .or(Err(Error::Syntax(self.offset)))
            .map(Some)
    }
}

/// A sequence of CBOR segments
///
/// CBOR allows for bytes or text items to be segmented. This type represents
/// the state of that segmented input stream.
pub struct Segments<'r, R: Read, P: Parser> {
    /// Decoder the segment headers (and, via `Segment`, bodies) are read from.
    reader: &'r mut Decoder<R>,
    /// When `true`, `pull` returns `Ok(None)` without reading any further.
    finish: bool,
    /// Current nesting depth: incremented when `unwrap` yields `Ok(None)`,
    /// decremented when a `Header::Break` is read at a depth greater than one.
    nested: usize,
    /// Marker for the parser type handed to each produced `Segment`.
    parser: PhantomData<P>,
    /// Classifies a header: `Err(())` is reported as a syntax error,
    /// `Ok(None)` opens a nesting level, and `Ok(Some(len))` starts a segment
    /// of `len` bytes.
    unwrap: fn(Header) -> Result<Option<usize>, ()>,
}

impl<'r, R: Read, P: Parser> Segments<'r, R, P> {
    /// Creates the segment stream state on top of `decoder`.
    ///
    /// `unwrap` classifies each header that is read: `Err(())` becomes a
    /// syntax error, `Ok(None)` opens a nesting level, and `Ok(Some(len))`
    /// starts a segment of `len` bytes.
    #[inline]
    pub(crate) fn new(
        decoder: &'r mut Decoder<R>,
        unwrap: fn(Header) -> Result<Option<usize>, ()>,
    ) -> Self {
        Self {
            reader: decoder,
            finish: false,
            nested: 0,
            parser: PhantomData,
            unwrap,
        }
    }

    /// Gets the next segment in the stream
    ///
    /// Returns `Ok(None)` at the conclusion of the stream. Once the stream
    /// has concluded, every further call also returns `Ok(None)` and does not
    /// touch the underlying decoder.
    ///
    /// # Errors
    ///
    /// Returns `Error::Syntax` with the header's offset when `unwrap` rejects
    /// a header, and propagates any error from pulling a header.
    #[inline]
    pub fn pull(&mut self) -> Result<Option<Segment<R, P>>, Error<R::Error>> {
        while !self.finish {
            let offset = self.reader.offset();
            match self.reader.pull()? {
                Header::Break if self.nested == 1 => {
                    // The outermost level has been closed. Latch the end of
                    // the stream so that a later call cannot consume a header
                    // that belongs to whatever follows this item.
                    self.finish = true;
                    return Ok(None);
                }
                Header::Break if self.nested > 1 => self.nested -= 1,
                header => match (self.unwrap)(header) {
                    Err(..) => return Err(Error::Syntax(offset)),
                    Ok(None) => self.nested += 1,
                    Ok(Some(len)) => {
                        // A sized segment outside of any nesting is the only
                        // segment of the stream.
                        self.finish = self.nested == 0;
                        return Ok(Some(Segment {
                            reader: self.reader,
                            unread: len,
                            offset,
                            parser: P::default(),
                        }));
                    }
                },
            }
        }

        Ok(None)
    }
}