1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 use super::super::branch_meta::BranchMeta; 6 use super::super::bytestr::ByteStr; 7 use super::store::const_for_each; 8 use super::store::ConstArrayBuilder; 9 use super::store::ConstLengthsStack; 10 use super::store::ConstSlice; 11 use crate::error::ZeroTrieBuildError; 12 use crate::varint; 13 14 /// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts. 15 pub(crate) struct ZeroTrieBuilderConst<const N: usize> { 16 data: ConstArrayBuilder<N, u8>, 17 } 18 19 impl<const N: usize> ZeroTrieBuilderConst<N> { 20 /// Non-const function that returns the current trie data as a slice. 21 #[cfg(feature = "litemap")] as_bytes(&self) -> &[u8]22 pub fn as_bytes(&self) -> &[u8] { 23 self.data.as_const_slice().as_slice() 24 } 25 26 /// Returns the trie data, panicking if the buffer is the wrong size. build_or_panic(self) -> [u8; N]27 pub const fn build_or_panic(self) -> [u8; N] { 28 self.data.const_build_or_panic() 29 } 30 31 /// Creates a new empty builder. new() -> Self32 pub const fn new() -> Self { 33 Self { 34 data: ConstArrayBuilder::new_empty([0; N], N), 35 } 36 } 37 38 /// Prepends an ASCII node to the front of the builder. Returns the new builder 39 /// and the delta in length, which is always 1. 40 #[must_use] prepend_ascii(self, ascii: u8) -> (Self, usize)41 const fn prepend_ascii(self, ascii: u8) -> (Self, usize) { 42 if ascii >= 128 { 43 panic!("Non-ASCII not supported in ZeroTrieSimpleAscii"); 44 } 45 let data = self.data.const_push_front_or_panic(ascii); 46 (Self { data }, 1) 47 } 48 49 /// Prepends a value node to the front of the builder. Returns the new builder 50 /// and the delta in length, which depends on the size of the varint. 51 #[must_use] prepend_value(self, value: usize) -> (Self, usize)52 const fn prepend_value(self, value: usize) -> (Self, usize) { 53 let mut data = self.data; 54 let varint_array = varint::write_varint_meta3(value); 55 data = data.const_extend_front_or_panic(varint_array.as_const_slice()); 56 data = data.const_bitor_assign(0, 0b10000000); 57 (Self { data }, varint_array.len()) 58 } 59 60 /// Prepends a branch node to the front of the builder. Returns the new builder 61 /// and the delta in length, which depends on the size of the varint. 62 #[must_use] prepend_branch(self, value: usize) -> (Self, usize)63 const fn prepend_branch(self, value: usize) -> (Self, usize) { 64 let mut data = self.data; 65 let varint_array = varint::write_varint_meta2(value); 66 data = data.const_extend_front_or_panic(varint_array.as_const_slice()); 67 data = data.const_bitor_assign(0, 0b11000000); 68 (Self { data }, varint_array.len()) 69 } 70 71 /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder 72 /// and the delta in length, which is the length of the slice. 73 #[must_use] prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize)74 const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) { 75 let mut data = self.data; 76 let mut i = s.len(); 77 while i > 0 { 78 data = data.const_push_front_or_panic(*s.get_or_panic(i - 1)); 79 i -= 1; 80 } 81 (Self { data }, s.len()) 82 } 83 84 /// Prepends multiple zeros to the front of the builder. Returns the new builder. 85 #[must_use] prepend_n_zeros(self, n: usize) -> Self86 const fn prepend_n_zeros(self, n: usize) -> Self { 87 let mut data = self.data; 88 let mut i = 0; 89 while i < n { 90 data = data.const_push_front_or_panic(0); 91 i += 1; 92 } 93 Self { data } 94 } 95 96 /// Performs the operation `self[index] |= bits` bitor_assign_at(self, index: usize, bits: u8) -> Self97 const fn bitor_assign_at(self, index: usize, bits: u8) -> Self { 98 let mut data = self.data; 99 data = data.const_bitor_assign(index, bits); 100 Self { data } 101 } 102 103 /// Creates a new builder containing the elements in the given slice of key/value pairs. 104 /// 105 /// `K` is the stack size of the lengths stack. If you get an error such as 106 /// "AsciiTrie Builder: Need more stack", try increasing `K`. 107 /// 108 /// # Panics 109 /// 110 /// Panics if the items are not sorted from_tuple_slice<'a, const K: usize>( items: &[(&'a ByteStr, usize)], ) -> Result<Self, ZeroTrieBuildError>111 pub const fn from_tuple_slice<'a, const K: usize>( 112 items: &[(&'a ByteStr, usize)], 113 ) -> Result<Self, ZeroTrieBuildError> { 114 let items = ConstSlice::from_slice(items); 115 let mut prev: Option<&'a ByteStr> = None; 116 const_for_each!(items, (ascii_str, _), { 117 match prev { 118 None => (), 119 Some(prev) => { 120 if !prev.is_less_then(ascii_str) { 121 panic!("Strings in ByteStr constructor are not sorted"); 122 } 123 } 124 }; 125 prev = Some(ascii_str) 126 }); 127 Self::from_sorted_const_tuple_slice::<K>(items) 128 } 129 130 /// Creates a new builder containing the elements in the given slice of key/value pairs. 131 /// 132 /// Assumes that the items are sorted. If they are not, unexpected behavior may occur. 133 /// 134 /// `K` is the stack size of the lengths stack. If you get an error such as 135 /// "AsciiTrie Builder: Need more stack", try increasing `K`. from_sorted_const_tuple_slice<const K: usize>( items: ConstSlice<(&ByteStr, usize)>, ) -> Result<Self, ZeroTrieBuildError>136 pub const fn from_sorted_const_tuple_slice<const K: usize>( 137 items: ConstSlice<(&ByteStr, usize)>, 138 ) -> Result<Self, ZeroTrieBuildError> { 139 let mut result = Self::new(); 140 let total_size; 141 (result, total_size) = result.create_or_panic::<K>(items); 142 debug_assert!(total_size == result.data.len()); 143 Ok(result) 144 } 145 146 /// The actual builder algorithm. For an explanation, see [`crate::builder`]. 147 #[must_use] create_or_panic<const K: usize>( mut self, all_items: ConstSlice<(&ByteStr, usize)>, ) -> (Self, usize)148 const fn create_or_panic<const K: usize>( 149 mut self, 150 all_items: ConstSlice<(&ByteStr, usize)>, 151 ) -> (Self, usize) { 152 let mut prefix_len = match all_items.last() { 153 Some(x) => x.0.len(), 154 // Empty slice: 155 None => return (Self::new(), 0), 156 }; 157 // Initialize the main loop to point at the last string. 158 let mut lengths_stack = ConstLengthsStack::<K>::new(); 159 let mut i = all_items.len() - 1; 160 let mut j = all_items.len(); 161 let mut current_len = 0; 162 // Start the main loop. 163 loop { 164 let item_i = all_items.get_or_panic(i); 165 let item_j = all_items.get_or_panic(j - 1); 166 debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); 167 // Check if we need to add a value node here. 168 if item_i.0.len() == prefix_len { 169 let len; 170 (self, len) = self.prepend_value(item_i.1); 171 current_len += len; 172 } 173 if prefix_len == 0 { 174 // All done! Leave the main loop. 175 break; 176 } 177 // Reduce the prefix length by 1 and recalculate i and j. 178 prefix_len -= 1; 179 let mut new_i = i; 180 let mut new_j = j; 181 let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len); 182 let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len); 183 debug_assert!(ascii_i == ascii_j); 184 let key_ascii = ascii_i; 185 loop { 186 if new_i == 0 { 187 break; 188 } 189 let candidate = all_items.get_or_panic(new_i - 1).0; 190 if candidate.len() < prefix_len { 191 // Too short 192 break; 193 } 194 if item_i.0.prefix_eq(candidate, prefix_len) { 195 new_i -= 1; 196 } else { 197 break; 198 } 199 if candidate.len() == prefix_len { 200 // A string that equals the prefix does not take part in the branch node. 201 break; 202 } 203 let candidate = candidate.byte_at_or_panic(prefix_len); 204 if candidate != ascii_i { 205 ascii_i = candidate; 206 } 207 } 208 loop { 209 if new_j == all_items.len() { 210 break; 211 } 212 let candidate = all_items.get_or_panic(new_j).0; 213 if candidate.len() < prefix_len { 214 // Too short 215 break; 216 } 217 if item_j.0.prefix_eq(candidate, prefix_len) { 218 new_j += 1; 219 } else { 220 break; 221 } 222 if candidate.len() == prefix_len { 223 panic!("A shorter string should be earlier in the sequence"); 224 } 225 let candidate = candidate.byte_at_or_panic(prefix_len); 226 if candidate != ascii_j { 227 ascii_j = candidate; 228 } 229 } 230 // If there are no different bytes at this prefix level, we can add an ASCII or Span 231 // node and then continue to the next iteration of the main loop. 232 if ascii_i == key_ascii && ascii_j == key_ascii { 233 let len; 234 (self, len) = self.prepend_ascii(ascii_i); 235 current_len += len; 236 debug_assert!(i == new_i || i == new_i + 1); 237 i = new_i; 238 debug_assert!(j == new_j); 239 continue; 240 } 241 // If i and j changed, we are a target of a branch node. 242 if ascii_j == key_ascii { 243 // We are the _last_ target of a branch node. 244 lengths_stack = lengths_stack.push_or_panic(BranchMeta { 245 ascii: key_ascii, 246 cumulative_length: current_len, 247 local_length: current_len, 248 count: 1, 249 }); 250 } else { 251 // We are the _not the last_ target of a branch node. 252 let BranchMeta { 253 cumulative_length, 254 count, 255 .. 256 } = lengths_stack.peek_or_panic(); 257 lengths_stack = lengths_stack.push_or_panic(BranchMeta { 258 ascii: key_ascii, 259 cumulative_length: cumulative_length + current_len, 260 local_length: current_len, 261 count: count + 1, 262 }); 263 } 264 if ascii_i != key_ascii { 265 // We are _not the first_ target of a branch node. 266 // Set the cursor to the previous string and continue the loop. 267 j = i; 268 i -= 1; 269 prefix_len = all_items.get_or_panic(i).0.len(); 270 current_len = 0; 271 continue; 272 } 273 // Branch (first) 274 let (total_length, total_count) = { 275 let BranchMeta { 276 cumulative_length, 277 count, 278 .. 279 } = lengths_stack.peek_or_panic(); 280 (cumulative_length, count) 281 }; 282 let branch_metas; 283 (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count); 284 let original_keys = branch_metas.map_to_ascii_bytes(); 285 // Write out the offset table 286 current_len = total_length; 287 const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8; 288 let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8; 289 if w > 3 { 290 panic!("ZeroTrie capacity exceeded"); 291 } 292 let mut k = 0; 293 while k <= w { 294 self = self.prepend_n_zeros(total_count - 1); 295 current_len += total_count - 1; 296 let mut l = 0; 297 let mut length_to_write = 0; 298 while l < total_count { 299 let BranchMeta { local_length, .. } = *branch_metas 300 .as_const_slice() 301 .get_or_panic(total_count - l - 1); 302 let mut adjusted_length = length_to_write; 303 let mut m = 0; 304 while m < k { 305 adjusted_length >>= 8; 306 m += 1; 307 } 308 if l > 0 { 309 self = self.bitor_assign_at(l - 1, adjusted_length as u8); 310 } 311 l += 1; 312 length_to_write += local_length; 313 } 314 k += 1; 315 } 316 // Write out the lookup table 317 assert!(0 < total_count && total_count <= 256); 318 let branch_value = (w << 8) + (total_count & 0xff); 319 let slice_len; 320 (self, slice_len) = self.prepend_slice(original_keys.as_const_slice()); 321 let branch_len; 322 (self, branch_len) = self.prepend_branch(branch_value); 323 current_len += slice_len + branch_len; 324 i = new_i; 325 j = new_j; 326 } 327 assert!(lengths_stack.is_empty()); 328 (self, current_len) 329 } 330 } 331