@@ -5,7 +5,7 @@ pub(crate) struct Leaf<const LEAF_FANOUT: usize> {
55 pub lo : InlineArray ,
66 pub hi : Option < InlineArray > ,
77 pub prefix_length : usize ,
8- pub data : stack_map:: StackMap < InlineArray , InlineArray , LEAF_FANOUT > ,
8+ data : stack_map:: StackMap < InlineArray , InlineArray , LEAF_FANOUT > ,
99 pub in_memory_size : usize ,
1010 pub mutation_count : u64 ,
1111 #[ serde( skip) ]
@@ -58,13 +58,8 @@ impl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {
5858
5959 pub ( crate ) fn get ( & self , key : & [ u8 ] ) -> Option < & InlineArray > {
6060 assert ! ( self . deleted. is_none( ) ) ;
61- let prefixed_key = if self . prefix_length == 0 {
62- key
63- } else {
64- let prefix = self . prefix ( ) ;
65- assert ! ( key. starts_with( prefix) ) ;
66- & key[ self . prefix_length ..]
67- } ;
61+ assert ! ( key. starts_with( self . prefix( ) ) ) ;
62+ let prefixed_key = & key[ self . prefix_length ..] ;
6863 self . data . get ( prefixed_key)
6964 }
7065
@@ -74,13 +69,8 @@ impl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {
7469 value : InlineArray ,
7570 ) -> Option < InlineArray > {
7671 assert ! ( self . deleted. is_none( ) ) ;
77- let prefixed_key = if self . prefix_length == 0 {
78- key
79- } else {
80- let prefix = self . prefix ( ) ;
81- assert ! ( key. starts_with( prefix) ) ;
82- key[ self . prefix_length ..] . into ( )
83- } ;
72+ assert ! ( key. starts_with( self . prefix( ) ) ) ;
73+ let prefixed_key = key[ self . prefix_length ..] . into ( ) ;
8474 self . data . insert ( prefixed_key, value)
8575 }
8676
@@ -91,4 +81,230 @@ impl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {
9181 let partial_key = & key[ self . prefix_length ..] ;
9282 self . data . remove ( partial_key)
9383 }
84+
85+ pub ( crate ) fn merge_from ( & mut self , other : & mut Self ) {
86+ assert ! ( self . is_empty( ) ) ;
87+
88+ self . hi = other. hi . clone ( ) ;
89+
90+ let new_prefix_len = if let Some ( hi) = & self . hi {
91+ self . lo . iter ( ) . zip ( hi. iter ( ) ) . take_while ( |( l, r) | l == r) . count ( )
92+ } else {
93+ 0
94+ } ;
95+
96+ assert_eq ! ( self . lo[ ..new_prefix_len] , other. lo[ ..new_prefix_len] ) ;
97+
98+ // self.prefix_length is not read because it's expected to be
99+ // initialized here.
100+ self . prefix_length = new_prefix_len;
101+
102+ if self . prefix ( ) == other. prefix ( ) {
103+ self . data = std:: mem:: take ( & mut other. data ) ;
104+ return ;
105+ }
106+
107+ assert ! (
108+ self . prefix_length < other. prefix_length,
109+ "self: {:?} other: {:?}" ,
110+ self ,
111+ other
112+ ) ;
113+
114+ let unshifted_key_amount = other. prefix_length - self . prefix_length ;
115+ let unshifted_prefix = & other. lo
116+ [ other. prefix_length - unshifted_key_amount..other. prefix_length ] ;
117+
118+ for ( k, v) in other. data . iter ( ) {
119+ let mut unshifted_key =
120+ Vec :: with_capacity ( unshifted_prefix. len ( ) + k. len ( ) ) ;
121+ unshifted_key. extend_from_slice ( unshifted_prefix) ;
122+ unshifted_key. extend_from_slice ( k) ;
123+ self . data . insert ( unshifted_key. into ( ) , v. clone ( ) ) ;
124+ }
125+
126+ assert_eq ! ( other. data. len( ) , self . data. len( ) ) ;
127+
128+ #[ cfg( feature = "for-internal-testing-only" ) ]
129+ assert_eq ! (
130+ self . iter( ) . collect:: <Vec <_>>( ) ,
131+ other. iter( ) . collect:: <Vec <_>>( ) ,
132+ "self: {:#?} \n other: {:#?}\n " ,
133+ self ,
134+ other
135+ ) ;
136+ }
137+
138+ pub ( crate ) fn iter (
139+ & self ,
140+ ) -> impl Iterator < Item = ( InlineArray , InlineArray ) > {
141+ let prefix = self . prefix ( ) ;
142+ self . data . iter ( ) . map ( |( k, v) | {
143+ let mut unshifted_key = Vec :: with_capacity ( prefix. len ( ) + k. len ( ) ) ;
144+ unshifted_key. extend_from_slice ( prefix) ;
145+ unshifted_key. extend_from_slice ( k) ;
146+ ( unshifted_key. into ( ) , v. clone ( ) )
147+ } )
148+ }
149+
150+ pub ( crate ) fn serialize ( & self , zstd_compression_level : i32 ) -> Vec < u8 > {
151+ let mut ret = vec ! [ ] ;
152+
153+ let mut zstd_enc =
154+ zstd:: stream:: Encoder :: new ( & mut ret, zstd_compression_level)
155+ . unwrap ( ) ;
156+
157+ bincode:: serialize_into ( & mut zstd_enc, self ) . unwrap ( ) ;
158+
159+ zstd_enc. finish ( ) . unwrap ( ) ;
160+
161+ ret
162+ }
163+
164+ pub ( crate ) fn deserialize (
165+ buf : & [ u8 ] ,
166+ ) -> std:: io:: Result < Box < Leaf < LEAF_FANOUT > > > {
167+ let zstd_decoded = zstd:: stream:: decode_all ( buf) . unwrap ( ) ;
168+ let mut leaf: Box < Leaf < LEAF_FANOUT > > =
169+ bincode:: deserialize ( & zstd_decoded) . unwrap ( ) ;
170+
171+ // use decompressed buffer length as a cheap proxy for in-memory size for now
172+ leaf. in_memory_size = zstd_decoded. len ( ) ;
173+
174+ Ok ( leaf)
175+ }
176+
177+ fn set_in_memory_size ( & mut self ) {
178+ self . in_memory_size = std:: mem:: size_of :: < Leaf < LEAF_FANOUT > > ( )
179+ + self . hi . as_ref ( ) . map ( |h| h. len ( ) ) . unwrap_or ( 0 )
180+ + self . lo . len ( )
181+ + self . data . iter ( ) . map ( |( k, v) | k. len ( ) + v. len ( ) ) . sum :: < usize > ( ) ;
182+ }
183+
184+ pub ( crate ) fn split_if_full (
185+ & mut self ,
186+ new_epoch : FlushEpoch ,
187+ allocator : & ObjectCache < LEAF_FANOUT > ,
188+ collection_id : CollectionId ,
189+ ) -> Option < ( InlineArray , Object < LEAF_FANOUT > ) > {
190+ if self . data . is_full ( ) {
191+ let original_len = self . data . len ( ) ;
192+
193+ let old_prefix_len = self . prefix_length ;
194+ // split
195+ let split_offset = if self . lo . is_empty ( ) {
196+ // split left-most shard almost at the beginning for
197+ // optimizing downward-growing workloads
198+ 1
199+ } else if self . hi . is_none ( ) {
200+ // split right-most shard almost at the end for
201+ // optimizing upward-growing workloads
202+ self . data . len ( ) - 2
203+ } else {
204+ self . data . len ( ) / 2
205+ } ;
206+
207+ let data = self . data . split_off ( split_offset) ;
208+
209+ let left_max = & self . data . last ( ) . unwrap ( ) . 0 ;
210+ let right_min = & data. first ( ) . unwrap ( ) . 0 ;
211+
212+ // suffix truncation attempts to shrink the split key
213+ // so that shorter keys bubble up into the index
214+ let splitpoint_length = right_min
215+ . iter ( )
216+ . zip ( left_max. iter ( ) )
217+ . take_while ( |( a, b) | a == b)
218+ . count ( )
219+ + 1 ;
220+
221+ let mut split_vec =
222+ Vec :: with_capacity ( self . prefix_length + splitpoint_length) ;
223+ split_vec. extend_from_slice ( self . prefix ( ) ) ;
224+ split_vec. extend_from_slice ( & right_min[ ..splitpoint_length] ) ;
225+ let split_key = InlineArray :: from ( split_vec) ;
226+
227+ let rhs_id = allocator. allocate_object_id ( new_epoch) ;
228+
229+ log:: trace!(
230+ "split leaf {:?} at split key: {:?} into new {:?} at {:?}" ,
231+ self . lo,
232+ split_key,
233+ rhs_id,
234+ new_epoch,
235+ ) ;
236+
237+ let mut rhs = Leaf {
238+ dirty_flush_epoch : Some ( new_epoch) ,
239+ hi : self . hi . clone ( ) ,
240+ lo : split_key. clone ( ) ,
241+ prefix_length : 0 ,
242+ in_memory_size : 0 ,
243+ data,
244+ mutation_count : 0 ,
245+ page_out_on_flush : None ,
246+ deleted : None ,
247+ max_unflushed_epoch : None ,
248+ } ;
249+
250+ rhs. shorten_keys_after_split ( old_prefix_len) ;
251+
252+ rhs. set_in_memory_size ( ) ;
253+
254+ self . hi = Some ( split_key. clone ( ) ) ;
255+
256+ self . shorten_keys_after_split ( old_prefix_len) ;
257+
258+ self . set_in_memory_size ( ) ;
259+
260+ assert_eq ! ( self . hi. as_ref( ) . unwrap( ) , & split_key) ;
261+ assert_eq ! ( rhs. lo, & split_key) ;
262+ assert_eq ! ( rhs. data. len( ) + self . data. len( ) , original_len) ;
263+
264+ let rhs_node = Object {
265+ object_id : rhs_id,
266+ collection_id,
267+ low_key : split_key. clone ( ) ,
268+ inner : Arc :: new ( RwLock :: new ( CacheBox {
269+ leaf : Some ( Box :: new ( rhs) ) ,
270+ logged_index : BTreeMap :: default ( ) ,
271+ } ) ) ,
272+ } ;
273+
274+ return Some ( ( split_key, rhs_node) ) ;
275+ }
276+
277+ None
278+ }
279+
280+ pub ( crate ) fn shorten_keys_after_split ( & mut self , old_prefix_len : usize ) {
281+ let Some ( hi) = self . hi . as_ref ( ) else { return } ;
282+
283+ let new_prefix_len =
284+ self . lo . iter ( ) . zip ( hi. iter ( ) ) . take_while ( |( l, r) | l == r) . count ( ) ;
285+
286+ assert_eq ! ( self . lo[ ..new_prefix_len] , hi[ ..new_prefix_len] ) ;
287+
288+ // self.prefix_length is not read because it's expected to be
289+ // initialized here.
290+ self . prefix_length = new_prefix_len;
291+
292+ if new_prefix_len == old_prefix_len {
293+ return ;
294+ }
295+
296+ assert ! (
297+ new_prefix_len > old_prefix_len,
298+ "expected new prefix length of {} to be greater than the pre-split prefix length of {} for node {:?}" ,
299+ new_prefix_len,
300+ old_prefix_len,
301+ self
302+ ) ;
303+
304+ let key_shift = new_prefix_len - old_prefix_len;
305+
306+ for ( k, v) in std:: mem:: take ( & mut self . data ) . iter ( ) {
307+ self . data . insert ( k[ key_shift..] . into ( ) , v. clone ( ) ) ;
308+ }
309+ }
94310}
0 commit comments