// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains various types for the header part of casemapping exception data
//!
//! This is both used in datagen to decode ICU4C's data, and natively in ICU4X's
//! own data model.
//!
//! [`ExceptionBits`] is the bag of bits associated with exceptions, and [`SlotPresence`]
//! marks the presence or absence of various "slots" in a given exception.
//!
//! The `exceptions_builder` module of this crate handles decoding ICU4C data using the exception
//! header, and [`crate::provider::exceptions`] handles the exception data in ICU4X's own data model.
use crate::provider::data::{DotType, MappingKind};
use zerovec::ule::{AsULE, ULE};
/// A bunch of bits associated with each exception.
///
/// This is the unpacked, one-field-per-flag form of the byte stored in [`ExceptionBitsULE`];
/// see that type for the bit layout.
///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
///
#[derive(Copy, Clone, PartialEq, Eq, Debug, Default)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
pub struct ExceptionBits {
/// Whether or not the slots are double-width (bit 0).
///
/// Unused in ICU4X
pub double_width_slots: bool,
/// There is no simple casefolding, even if there is a simple lowercase mapping (bit 1)
pub no_simple_case_folding: bool,
/// The delta stored in the `Delta` slot is negative (bit 2)
pub negative_delta: bool,
/// If the character is case sensitive (bit 3)
pub is_sensitive: bool,
/// The dot type of the character (bits 4..5)
pub dot_type: DotType,
/// If the character has conditional special casing (bit 6)
pub has_conditional_special: bool,
/// If the character has conditional case folding (bit 7)
pub has_conditional_fold: bool,
}
impl ExceptionBits {
    /// Unpack the flags from one byte: the upper half of an ICU4C-format `u16`.
    ///
    /// Each field is read through the accessor of the same name on [`ExceptionBitsULE`].
    pub(crate) fn from_integer(int: u8) -> Self {
        let packed = ExceptionBitsULE(int);
        Self {
            double_width_slots: packed.double_width_slots(),
            no_simple_case_folding: packed.no_simple_case_folding(),
            negative_delta: packed.negative_delta(),
            is_sensitive: packed.is_sensitive(),
            dot_type: packed.dot_type(),
            has_conditional_special: packed.has_conditional_special(),
            has_conditional_fold: packed.has_conditional_fold(),
        }
    }

    /// Pack the flags back into one byte: the upper half of an ICU4C-format `u16`.
    pub(crate) fn to_integer(self) -> u8 {
        // The dot type is the only multi-bit field; it seeds the accumulator.
        let dot_bits = (self.dot_type as u8) << ExceptionBitsULE::DOT_SHIFT;

        // Every remaining field is a single-bit flag: pair each with its mask.
        let flags = [
            (self.double_width_slots, ExceptionBitsULE::DOUBLE_SLOTS_FLAG),
            (
                self.no_simple_case_folding,
                ExceptionBitsULE::NO_SIMPLE_CASE_FOLDING_FLAG,
            ),
            (self.negative_delta, ExceptionBitsULE::NEGATIVE_DELTA_FLAG),
            (self.is_sensitive, ExceptionBitsULE::SENSITIVE_FLAG),
            (
                self.has_conditional_special,
                ExceptionBitsULE::CONDITIONAL_SPECIAL_FLAG,
            ),
            (
                self.has_conditional_fold,
                ExceptionBitsULE::CONDITIONAL_FOLD_FLAG,
            ),
        ];

        flags
            .iter()
            .filter(|&&(is_set, _)| is_set)
            .fold(dot_bits, |packed, &(_, mask)| packed | mask)
    }
}
/// Packed slot presence marker
///
/// Bit `n` is set when the slot numbered `n` is present in the exception.
///
/// All bits are valid, though bit 5 is unused and reserved
///
/// Bits:
///
/// ```text
/// 0: Lowercase mapping (code point)
/// 1: Case folding (code point)
/// 2: Uppercase mapping (code point)
/// 3: Titlecase mapping (code point)
/// 4: Delta to simple case mapping (code point) (sign stored separately)
/// 5: RESERVED
/// 6: Closure mappings (string; see below)
/// 7: Full mappings (strings; see below)
/// ```
///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
///
#[derive(Copy, Clone, PartialEq, Eq, ULE, Debug, Default)]
#[repr(transparent)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
pub struct SlotPresence(pub u8);
impl SlotPresence {
    /// Mark `slot` as present by setting the bit at the slot's index.
    ///
    /// Adding a slot that is already present leaves the value unchanged.
    pub(crate) fn add_slot(&mut self, slot: ExceptionSlot) {
        let mask = 1u8 << (slot as u8);
        self.0 |= mask;
    }

    /// Return `true` if the bit at the index of `slot` is set.
    pub(crate) fn has_slot(self, slot: ExceptionSlot) -> bool {
        // Move the slot's bit down to position 0 and test it there.
        (self.0 >> (slot as u8)) & 1 == 1
    }
}
/// The bitflags on an exception header, packed into a single byte.
///
/// Format from ICU4C, documented in casepropsbuilder.cpp. The bit numbers below are shifted
/// down by 8 bits, since ICU4C has these flags packed into one `u16` alongside a
/// [`SlotPresence`].
///
/// ```text
/// 0 Double-width slots. If set, then each optional slot is stored as two
/// elements of the array (high and low halves of 32-bit values) instead of
/// a single element.
/// 1 Has no simple case folding, even if there is a simple lowercase mapping
/// 2 The value in the delta slot is negative
/// 3 Is case-sensitive (not exposed)
/// 4..5 Dot type
/// 6 Has conditional special casing
/// 7 Has conditional case folding
/// ```
///
/// All bits are valid, though in ICU4X data bits 0 and 2 are not used
///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
///
#[derive(Copy, Clone, PartialEq, Eq, ULE, Debug)]
#[repr(transparent)]
pub struct ExceptionBitsULE(pub u8);
impl ExceptionBitsULE {
    /// Bit 0: the slots are double-width.
    const DOUBLE_SLOTS_FLAG: u8 = 0b0000_0001;
    /// Bit 1: there is no simple case folding.
    const NO_SIMPLE_CASE_FOLDING_FLAG: u8 = 0b0000_0010;
    /// Bit 2: the value in the delta slot is negative.
    const NEGATIVE_DELTA_FLAG: u8 = 0b0000_0100;
    /// Bit 3: the character is case-sensitive.
    const SENSITIVE_FLAG: u8 = 0b0000_1000;
    /// Position of the lowest bit of the dot type (bits 4..5).
    const DOT_SHIFT: u8 = 4;
    /// Bit 6: the character has conditional special casing.
    const CONDITIONAL_SPECIAL_FLAG: u8 = 0b0100_0000;
    /// Bit 7: the character has conditional case folding.
    const CONDITIONAL_FOLD_FLAG: u8 = 0b1000_0000;
}
impl ExceptionBitsULE {
    /// Returns `true` if any bit of `flag` is set in the packed byte.
    fn has_flag(self, flag: u8) -> bool {
        (self.0 & flag) != 0
    }

    /// Whether or not the slots are double-width.
    ///
    /// Unused in ICU4X
    pub fn double_width_slots(self) -> bool {
        self.has_flag(Self::DOUBLE_SLOTS_FLAG)
    }

    /// There is no simple casefolding, even if there is a simple lowercase mapping
    pub fn no_simple_case_folding(self) -> bool {
        self.has_flag(Self::NO_SIMPLE_CASE_FOLDING_FLAG)
    }

    /// The delta stored in the `Delta` slot is negative
    pub fn negative_delta(self) -> bool {
        self.has_flag(Self::NEGATIVE_DELTA_FLAG)
    }

    /// If the character is case sensitive
    pub fn is_sensitive(self) -> bool {
        self.has_flag(Self::SENSITIVE_FLAG)
    }

    /// If the character has conditional special casing
    pub fn has_conditional_special(self) -> bool {
        self.has_flag(Self::CONDITIONAL_SPECIAL_FLAG)
    }

    /// If the character has conditional case folding
    pub fn has_conditional_fold(self) -> bool {
        self.has_flag(Self::CONDITIONAL_FOLD_FLAG)
    }

    /// The dot type of the character
    pub fn dot_type(self) -> DotType {
        // Bring the dot-type bits down to the bottom of the value, then mask off the rest.
        let shifted = u16::from(self.0 >> Self::DOT_SHIFT);
        let masked = shifted & DotType::DOT_MASK;
        DotType::from_masked_bits(masked)
    }
}
impl AsULE for ExceptionBits {
type ULE = ExceptionBitsULE;
fn from_unaligned(u: ExceptionBitsULE) -> Self {
ExceptionBits::from_integer(u.0)
}
fn to_unaligned(self) -> ExceptionBitsULE {
ExceptionBitsULE(self.to_integer())
}
}
impl AsULE for SlotPresence {
    // `SlotPresence` serves as its own unaligned form, so both conversions are the identity.
    type ULE = Self;

    /// Return the value unchanged.
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        unaligned
    }

    /// Return the value unchanged.
    fn to_unaligned(self) -> Self::ULE {
        self
    }
}
/// The different slots that may be present in slot-based exception data
///
/// Each variant's discriminant is the index of the bit that marks the slot in a
/// [`SlotPresence`].
///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
///
#[derive(Copy, Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
pub(crate) enum ExceptionSlot {
/// Lowercase mapping
Lower = 0,
/// Case folding
Fold = 1,
/// Uppercase mapping
Upper = 2,
/// Titlecase mapping
Title = 3,
/// The delta to the simple case mapping (the sign is stored separately)
Delta = 4,
// Slot 5 is reserved
/// The closure set
Closure = 6,
/// The four full-mappings
FullMappings = 7,
}
impl ExceptionSlot {
/// Where the string slots begin
pub(crate) const STRING_SLOTS_START: Self = Self::Closure;
}
impl From for ExceptionSlot {
fn from(full: MappingKind) -> Self {
match full {
MappingKind::Lower => Self::Lower,
MappingKind::Fold => Self::Fold,
MappingKind::Upper => Self::Upper,
MappingKind::Title => Self::Title,
}
}
}