• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 use super::super::branch_meta::BranchMeta;
6 use super::super::bytestr::ByteStr;
7 use super::store::const_for_each;
8 use super::store::ConstArrayBuilder;
9 use super::store::ConstLengthsStack;
10 use super::store::ConstSlice;
11 use crate::error::ZeroTrieBuildError;
12 use crate::varint;
13 
14 /// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts.
15 pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
16     data: ConstArrayBuilder<N, u8>,
17 }
18 
19 impl<const N: usize> ZeroTrieBuilderConst<N> {
20     /// Non-const function that returns the current trie data as a slice.
21     #[cfg(feature = "litemap")]
as_bytes(&self) -> &[u8]22     pub fn as_bytes(&self) -> &[u8] {
23         self.data.as_const_slice().as_slice()
24     }
25 
26     /// Returns the trie data, panicking if the buffer is the wrong size.
build_or_panic(self) -> [u8; N]27     pub const fn build_or_panic(self) -> [u8; N] {
28         self.data.const_build_or_panic()
29     }
30 
31     /// Creates a new empty builder.
new() -> Self32     pub const fn new() -> Self {
33         Self {
34             data: ConstArrayBuilder::new_empty([0; N], N),
35         }
36     }
37 
38     /// Prepends an ASCII node to the front of the builder. Returns the new builder
39     /// and the delta in length, which is always 1.
40     #[must_use]
prepend_ascii(self, ascii: u8) -> (Self, usize)41     const fn prepend_ascii(self, ascii: u8) -> (Self, usize) {
42         if ascii >= 128 {
43             panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
44         }
45         let data = self.data.const_push_front_or_panic(ascii);
46         (Self { data }, 1)
47     }
48 
49     /// Prepends a value node to the front of the builder. Returns the new builder
50     /// and the delta in length, which depends on the size of the varint.
51     #[must_use]
prepend_value(self, value: usize) -> (Self, usize)52     const fn prepend_value(self, value: usize) -> (Self, usize) {
53         let mut data = self.data;
54         let varint_array = varint::write_varint_meta3(value);
55         data = data.const_extend_front_or_panic(varint_array.as_const_slice());
56         data = data.const_bitor_assign(0, 0b10000000);
57         (Self { data }, varint_array.len())
58     }
59 
60     /// Prepends a branch node to the front of the builder. Returns the new builder
61     /// and the delta in length, which depends on the size of the varint.
62     #[must_use]
prepend_branch(self, value: usize) -> (Self, usize)63     const fn prepend_branch(self, value: usize) -> (Self, usize) {
64         let mut data = self.data;
65         let varint_array = varint::write_varint_meta2(value);
66         data = data.const_extend_front_or_panic(varint_array.as_const_slice());
67         data = data.const_bitor_assign(0, 0b11000000);
68         (Self { data }, varint_array.len())
69     }
70 
71     /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
72     /// and the delta in length, which is the length of the slice.
73     #[must_use]
prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize)74     const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) {
75         let mut data = self.data;
76         let mut i = s.len();
77         while i > 0 {
78             data = data.const_push_front_or_panic(*s.get_or_panic(i - 1));
79             i -= 1;
80         }
81         (Self { data }, s.len())
82     }
83 
84     /// Prepends multiple zeros to the front of the builder. Returns the new builder.
85     #[must_use]
prepend_n_zeros(self, n: usize) -> Self86     const fn prepend_n_zeros(self, n: usize) -> Self {
87         let mut data = self.data;
88         let mut i = 0;
89         while i < n {
90             data = data.const_push_front_or_panic(0);
91             i += 1;
92         }
93         Self { data }
94     }
95 
96     /// Performs the operation `self[index] |= bits`
bitor_assign_at(self, index: usize, bits: u8) -> Self97     const fn bitor_assign_at(self, index: usize, bits: u8) -> Self {
98         let mut data = self.data;
99         data = data.const_bitor_assign(index, bits);
100         Self { data }
101     }
102 
103     /// Creates a new builder containing the elements in the given slice of key/value pairs.
104     ///
105     /// `K` is the stack size of the lengths stack. If you get an error such as
106     /// "AsciiTrie Builder: Need more stack", try increasing `K`.
107     ///
108     /// # Panics
109     ///
110     /// Panics if the items are not sorted
from_tuple_slice<'a, const K: usize>( items: &[(&'a ByteStr, usize)], ) -> Result<Self, ZeroTrieBuildError>111     pub const fn from_tuple_slice<'a, const K: usize>(
112         items: &[(&'a ByteStr, usize)],
113     ) -> Result<Self, ZeroTrieBuildError> {
114         let items = ConstSlice::from_slice(items);
115         let mut prev: Option<&'a ByteStr> = None;
116         const_for_each!(items, (ascii_str, _), {
117             match prev {
118                 None => (),
119                 Some(prev) => {
120                     if !prev.is_less_then(ascii_str) {
121                         panic!("Strings in ByteStr constructor are not sorted");
122                     }
123                 }
124             };
125             prev = Some(ascii_str)
126         });
127         Self::from_sorted_const_tuple_slice::<K>(items)
128     }
129 
130     /// Creates a new builder containing the elements in the given slice of key/value pairs.
131     ///
132     /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
133     ///
134     /// `K` is the stack size of the lengths stack. If you get an error such as
135     /// "AsciiTrie Builder: Need more stack", try increasing `K`.
from_sorted_const_tuple_slice<const K: usize>( items: ConstSlice<(&ByteStr, usize)>, ) -> Result<Self, ZeroTrieBuildError>136     pub const fn from_sorted_const_tuple_slice<const K: usize>(
137         items: ConstSlice<(&ByteStr, usize)>,
138     ) -> Result<Self, ZeroTrieBuildError> {
139         let mut result = Self::new();
140         let total_size;
141         (result, total_size) = result.create_or_panic::<K>(items);
142         debug_assert!(total_size == result.data.len());
143         Ok(result)
144     }
145 
146     /// The actual builder algorithm. For an explanation, see [`crate::builder`].
147     #[must_use]
create_or_panic<const K: usize>( mut self, all_items: ConstSlice<(&ByteStr, usize)>, ) -> (Self, usize)148     const fn create_or_panic<const K: usize>(
149         mut self,
150         all_items: ConstSlice<(&ByteStr, usize)>,
151     ) -> (Self, usize) {
152         let mut prefix_len = match all_items.last() {
153             Some(x) => x.0.len(),
154             // Empty slice:
155             None => return (Self::new(), 0),
156         };
157         // Initialize the main loop to point at the last string.
158         let mut lengths_stack = ConstLengthsStack::<K>::new();
159         let mut i = all_items.len() - 1;
160         let mut j = all_items.len();
161         let mut current_len = 0;
162         // Start the main loop.
163         loop {
164             let item_i = all_items.get_or_panic(i);
165             let item_j = all_items.get_or_panic(j - 1);
166             debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
167             // Check if we need to add a value node here.
168             if item_i.0.len() == prefix_len {
169                 let len;
170                 (self, len) = self.prepend_value(item_i.1);
171                 current_len += len;
172             }
173             if prefix_len == 0 {
174                 // All done! Leave the main loop.
175                 break;
176             }
177             // Reduce the prefix length by 1 and recalculate i and j.
178             prefix_len -= 1;
179             let mut new_i = i;
180             let mut new_j = j;
181             let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
182             let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
183             debug_assert!(ascii_i == ascii_j);
184             let key_ascii = ascii_i;
185             loop {
186                 if new_i == 0 {
187                     break;
188                 }
189                 let candidate = all_items.get_or_panic(new_i - 1).0;
190                 if candidate.len() < prefix_len {
191                     // Too short
192                     break;
193                 }
194                 if item_i.0.prefix_eq(candidate, prefix_len) {
195                     new_i -= 1;
196                 } else {
197                     break;
198                 }
199                 if candidate.len() == prefix_len {
200                     // A string that equals the prefix does not take part in the branch node.
201                     break;
202                 }
203                 let candidate = candidate.byte_at_or_panic(prefix_len);
204                 if candidate != ascii_i {
205                     ascii_i = candidate;
206                 }
207             }
208             loop {
209                 if new_j == all_items.len() {
210                     break;
211                 }
212                 let candidate = all_items.get_or_panic(new_j).0;
213                 if candidate.len() < prefix_len {
214                     // Too short
215                     break;
216                 }
217                 if item_j.0.prefix_eq(candidate, prefix_len) {
218                     new_j += 1;
219                 } else {
220                     break;
221                 }
222                 if candidate.len() == prefix_len {
223                     panic!("A shorter string should be earlier in the sequence");
224                 }
225                 let candidate = candidate.byte_at_or_panic(prefix_len);
226                 if candidate != ascii_j {
227                     ascii_j = candidate;
228                 }
229             }
230             // If there are no different bytes at this prefix level, we can add an ASCII or Span
231             // node and then continue to the next iteration of the main loop.
232             if ascii_i == key_ascii && ascii_j == key_ascii {
233                 let len;
234                 (self, len) = self.prepend_ascii(ascii_i);
235                 current_len += len;
236                 debug_assert!(i == new_i || i == new_i + 1);
237                 i = new_i;
238                 debug_assert!(j == new_j);
239                 continue;
240             }
241             // If i and j changed, we are a target of a branch node.
242             if ascii_j == key_ascii {
243                 // We are the _last_ target of a branch node.
244                 lengths_stack = lengths_stack.push_or_panic(BranchMeta {
245                     ascii: key_ascii,
246                     cumulative_length: current_len,
247                     local_length: current_len,
248                     count: 1,
249                 });
250             } else {
251                 // We are the _not the last_ target of a branch node.
252                 let BranchMeta {
253                     cumulative_length,
254                     count,
255                     ..
256                 } = lengths_stack.peek_or_panic();
257                 lengths_stack = lengths_stack.push_or_panic(BranchMeta {
258                     ascii: key_ascii,
259                     cumulative_length: cumulative_length + current_len,
260                     local_length: current_len,
261                     count: count + 1,
262                 });
263             }
264             if ascii_i != key_ascii {
265                 // We are _not the first_ target of a branch node.
266                 // Set the cursor to the previous string and continue the loop.
267                 j = i;
268                 i -= 1;
269                 prefix_len = all_items.get_or_panic(i).0.len();
270                 current_len = 0;
271                 continue;
272             }
273             // Branch (first)
274             let (total_length, total_count) = {
275                 let BranchMeta {
276                     cumulative_length,
277                     count,
278                     ..
279                 } = lengths_stack.peek_or_panic();
280                 (cumulative_length, count)
281             };
282             let branch_metas;
283             (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count);
284             let original_keys = branch_metas.map_to_ascii_bytes();
285             // Write out the offset table
286             current_len = total_length;
287             const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
288             let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
289             if w > 3 {
290                 panic!("ZeroTrie capacity exceeded");
291             }
292             let mut k = 0;
293             while k <= w {
294                 self = self.prepend_n_zeros(total_count - 1);
295                 current_len += total_count - 1;
296                 let mut l = 0;
297                 let mut length_to_write = 0;
298                 while l < total_count {
299                     let BranchMeta { local_length, .. } = *branch_metas
300                         .as_const_slice()
301                         .get_or_panic(total_count - l - 1);
302                     let mut adjusted_length = length_to_write;
303                     let mut m = 0;
304                     while m < k {
305                         adjusted_length >>= 8;
306                         m += 1;
307                     }
308                     if l > 0 {
309                         self = self.bitor_assign_at(l - 1, adjusted_length as u8);
310                     }
311                     l += 1;
312                     length_to_write += local_length;
313                 }
314                 k += 1;
315             }
316             // Write out the lookup table
317             assert!(0 < total_count && total_count <= 256);
318             let branch_value = (w << 8) + (total_count & 0xff);
319             let slice_len;
320             (self, slice_len) = self.prepend_slice(original_keys.as_const_slice());
321             let branch_len;
322             (self, branch_len) = self.prepend_branch(branch_value);
323             current_len += slice_len + branch_len;
324             i = new_i;
325             j = new_j;
326         }
327         assert!(lengths_stack.is_empty());
328         (self, current_len)
329     }
330 }
331