Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 92cca2d

Browse files
authored
Merge pull request #1525 from spacejam/tyler_prefix_encoding
[bloodstone] implement prefix encoding
2 parents 869009a + f6870e0 commit 92cca2d

7 files changed

Lines changed: 305 additions & 166 deletions

File tree

src/leaf.rs

Lines changed: 231 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ pub(crate) struct Leaf<const LEAF_FANOUT: usize> {
55
pub lo: InlineArray,
66
pub hi: Option<InlineArray>,
77
pub prefix_length: usize,
8-
pub data: stack_map::StackMap<InlineArray, InlineArray, LEAF_FANOUT>,
8+
data: stack_map::StackMap<InlineArray, InlineArray, LEAF_FANOUT>,
99
pub in_memory_size: usize,
1010
pub mutation_count: u64,
1111
#[serde(skip)]
@@ -58,13 +58,8 @@ impl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {
5858

5959
pub(crate) fn get(&self, key: &[u8]) -> Option<&InlineArray> {
6060
assert!(self.deleted.is_none());
61-
let prefixed_key = if self.prefix_length == 0 {
62-
key
63-
} else {
64-
let prefix = self.prefix();
65-
assert!(key.starts_with(prefix));
66-
&key[self.prefix_length..]
67-
};
61+
assert!(key.starts_with(self.prefix()));
62+
let prefixed_key = &key[self.prefix_length..];
6863
self.data.get(prefixed_key)
6964
}
7065

@@ -74,13 +69,8 @@ impl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {
7469
value: InlineArray,
7570
) -> Option<InlineArray> {
7671
assert!(self.deleted.is_none());
77-
let prefixed_key = if self.prefix_length == 0 {
78-
key
79-
} else {
80-
let prefix = self.prefix();
81-
assert!(key.starts_with(prefix));
82-
key[self.prefix_length..].into()
83-
};
72+
assert!(key.starts_with(self.prefix()));
73+
let prefixed_key = key[self.prefix_length..].into();
8474
self.data.insert(prefixed_key, value)
8575
}
8676

@@ -91,4 +81,230 @@ impl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {
9181
let partial_key = &key[self.prefix_length..];
9282
self.data.remove(partial_key)
9383
}
84+
85+
pub(crate) fn merge_from(&mut self, other: &mut Self) {
86+
assert!(self.is_empty());
87+
88+
self.hi = other.hi.clone();
89+
90+
let new_prefix_len = if let Some(hi) = &self.hi {
91+
self.lo.iter().zip(hi.iter()).take_while(|(l, r)| l == r).count()
92+
} else {
93+
0
94+
};
95+
96+
assert_eq!(self.lo[..new_prefix_len], other.lo[..new_prefix_len]);
97+
98+
// self.prefix_length is not read because it's expected to be
99+
// initialized here.
100+
self.prefix_length = new_prefix_len;
101+
102+
if self.prefix() == other.prefix() {
103+
self.data = std::mem::take(&mut other.data);
104+
return;
105+
}
106+
107+
assert!(
108+
self.prefix_length < other.prefix_length,
109+
"self: {:?} other: {:?}",
110+
self,
111+
other
112+
);
113+
114+
let unshifted_key_amount = other.prefix_length - self.prefix_length;
115+
let unshifted_prefix = &other.lo
116+
[other.prefix_length - unshifted_key_amount..other.prefix_length];
117+
118+
for (k, v) in other.data.iter() {
119+
let mut unshifted_key =
120+
Vec::with_capacity(unshifted_prefix.len() + k.len());
121+
unshifted_key.extend_from_slice(unshifted_prefix);
122+
unshifted_key.extend_from_slice(k);
123+
self.data.insert(unshifted_key.into(), v.clone());
124+
}
125+
126+
assert_eq!(other.data.len(), self.data.len());
127+
128+
#[cfg(feature = "for-internal-testing-only")]
129+
assert_eq!(
130+
self.iter().collect::<Vec<_>>(),
131+
other.iter().collect::<Vec<_>>(),
132+
"self: {:#?} \n other: {:#?}\n",
133+
self,
134+
other
135+
);
136+
}
137+
138+
pub(crate) fn iter(
139+
&self,
140+
) -> impl Iterator<Item = (InlineArray, InlineArray)> {
141+
let prefix = self.prefix();
142+
self.data.iter().map(|(k, v)| {
143+
let mut unshifted_key = Vec::with_capacity(prefix.len() + k.len());
144+
unshifted_key.extend_from_slice(prefix);
145+
unshifted_key.extend_from_slice(k);
146+
(unshifted_key.into(), v.clone())
147+
})
148+
}
149+
150+
pub(crate) fn serialize(&self, zstd_compression_level: i32) -> Vec<u8> {
151+
let mut ret = vec![];
152+
153+
let mut zstd_enc =
154+
zstd::stream::Encoder::new(&mut ret, zstd_compression_level)
155+
.unwrap();
156+
157+
bincode::serialize_into(&mut zstd_enc, self).unwrap();
158+
159+
zstd_enc.finish().unwrap();
160+
161+
ret
162+
}
163+
164+
pub(crate) fn deserialize(
165+
buf: &[u8],
166+
) -> std::io::Result<Box<Leaf<LEAF_FANOUT>>> {
167+
let zstd_decoded = zstd::stream::decode_all(buf).unwrap();
168+
let mut leaf: Box<Leaf<LEAF_FANOUT>> =
169+
bincode::deserialize(&zstd_decoded).unwrap();
170+
171+
// use decompressed buffer length as a cheap proxy for in-memory size for now
172+
leaf.in_memory_size = zstd_decoded.len();
173+
174+
Ok(leaf)
175+
}
176+
177+
fn set_in_memory_size(&mut self) {
178+
self.in_memory_size = std::mem::size_of::<Leaf<LEAF_FANOUT>>()
179+
+ self.hi.as_ref().map(|h| h.len()).unwrap_or(0)
180+
+ self.lo.len()
181+
+ self.data.iter().map(|(k, v)| k.len() + v.len()).sum::<usize>();
182+
}
183+
184+
pub(crate) fn split_if_full(
185+
&mut self,
186+
new_epoch: FlushEpoch,
187+
allocator: &ObjectCache<LEAF_FANOUT>,
188+
collection_id: CollectionId,
189+
) -> Option<(InlineArray, Object<LEAF_FANOUT>)> {
190+
if self.data.is_full() {
191+
let original_len = self.data.len();
192+
193+
let old_prefix_len = self.prefix_length;
194+
// split
195+
let split_offset = if self.lo.is_empty() {
196+
// split left-most shard almost at the beginning for
197+
// optimizing downward-growing workloads
198+
1
199+
} else if self.hi.is_none() {
200+
// split right-most shard almost at the end for
201+
// optimizing upward-growing workloads
202+
self.data.len() - 2
203+
} else {
204+
self.data.len() / 2
205+
};
206+
207+
let data = self.data.split_off(split_offset);
208+
209+
let left_max = &self.data.last().unwrap().0;
210+
let right_min = &data.first().unwrap().0;
211+
212+
// suffix truncation attempts to shrink the split key
213+
// so that shorter keys bubble up into the index
214+
let splitpoint_length = right_min
215+
.iter()
216+
.zip(left_max.iter())
217+
.take_while(|(a, b)| a == b)
218+
.count()
219+
+ 1;
220+
221+
let mut split_vec =
222+
Vec::with_capacity(self.prefix_length + splitpoint_length);
223+
split_vec.extend_from_slice(self.prefix());
224+
split_vec.extend_from_slice(&right_min[..splitpoint_length]);
225+
let split_key = InlineArray::from(split_vec);
226+
227+
let rhs_id = allocator.allocate_object_id(new_epoch);
228+
229+
log::trace!(
230+
"split leaf {:?} at split key: {:?} into new {:?} at {:?}",
231+
self.lo,
232+
split_key,
233+
rhs_id,
234+
new_epoch,
235+
);
236+
237+
let mut rhs = Leaf {
238+
dirty_flush_epoch: Some(new_epoch),
239+
hi: self.hi.clone(),
240+
lo: split_key.clone(),
241+
prefix_length: 0,
242+
in_memory_size: 0,
243+
data,
244+
mutation_count: 0,
245+
page_out_on_flush: None,
246+
deleted: None,
247+
max_unflushed_epoch: None,
248+
};
249+
250+
rhs.shorten_keys_after_split(old_prefix_len);
251+
252+
rhs.set_in_memory_size();
253+
254+
self.hi = Some(split_key.clone());
255+
256+
self.shorten_keys_after_split(old_prefix_len);
257+
258+
self.set_in_memory_size();
259+
260+
assert_eq!(self.hi.as_ref().unwrap(), &split_key);
261+
assert_eq!(rhs.lo, &split_key);
262+
assert_eq!(rhs.data.len() + self.data.len(), original_len);
263+
264+
let rhs_node = Object {
265+
object_id: rhs_id,
266+
collection_id,
267+
low_key: split_key.clone(),
268+
inner: Arc::new(RwLock::new(CacheBox {
269+
leaf: Some(Box::new(rhs)),
270+
logged_index: BTreeMap::default(),
271+
})),
272+
};
273+
274+
return Some((split_key, rhs_node));
275+
}
276+
277+
None
278+
}
279+
280+
pub(crate) fn shorten_keys_after_split(&mut self, old_prefix_len: usize) {
281+
let Some(hi) = self.hi.as_ref() else { return };
282+
283+
let new_prefix_len =
284+
self.lo.iter().zip(hi.iter()).take_while(|(l, r)| l == r).count();
285+
286+
assert_eq!(self.lo[..new_prefix_len], hi[..new_prefix_len]);
287+
288+
// self.prefix_length is not read because it's expected to be
289+
// initialized here.
290+
self.prefix_length = new_prefix_len;
291+
292+
if new_prefix_len == old_prefix_len {
293+
return;
294+
}
295+
296+
assert!(
297+
new_prefix_len > old_prefix_len,
298+
"expected new prefix length of {} to be greater than the pre-split prefix length of {} for node {:?}",
299+
new_prefix_len,
300+
old_prefix_len,
301+
self
302+
);
303+
304+
let key_shift = new_prefix_len - old_prefix_len;
305+
306+
for (k, v) in std::mem::take(&mut self.data).iter() {
307+
self.data.insert(k[key_shift..].into(), v.clone());
308+
}
309+
}
94310
}

src/lib.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
// 1.0 blockers
22
//
33
// bugs
4-
// * tree predecessor holds lock on successor and tries to get it for predecessor. This will
5-
// deadlock if used concurrently with write batches, which acquire locks lexicographically.
6-
// * add merges to iterator test and assert it deadlocks
7-
// * alternative is to merge right, not left
84
// * page-out needs to be deferred until after any flush of the dirty epoch
95
// * need to remove max_unflushed_epoch after flushing it
106
// * can't send reliable page-out request backwards from 7->6
@@ -25,6 +21,7 @@
2521
// * clean -> dirty -> {maybe coop} -> flushed
2622
// * for page-out, we only care if it's stable or if we need to add it to
2723
// a page-out priority queue
24+
// * page-out doesn't seem to happen as expected
2825
//
2926
// reliability
3027
// TODO make all writes wrapped in a Tearable wrapper that splits writes

src/object_cache.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,12 +396,12 @@ impl<const LEAF_FANOUT: usize> ObjectCache<LEAF_FANOUT> {
396396
// this being called in the destructor.
397397
pub fn mark_access_and_evict(
398398
&self,
399-
object_id: ObjectId,
399+
accessed_object_id: ObjectId,
400400
size: usize,
401401
#[allow(unused)] flush_epoch: FlushEpoch,
402402
) -> io::Result<()> {
403403
let mut ca = self.cache_advisor.borrow_mut();
404-
let to_evict = ca.accessed_reuse_buffer(*object_id, size);
404+
let to_evict = ca.accessed_reuse_buffer(*accessed_object_id, size);
405405
let mut not_found = 0;
406406
for (node_to_evict, _rough_size) in to_evict {
407407
let object_id =
@@ -411,6 +411,12 @@ impl<const LEAF_FANOUT: usize> ObjectCache<LEAF_FANOUT> {
411411
unreachable!("object ID must never have been 0");
412412
};
413413

414+
if accessed_object_id == object_id {
415+
// TODO our own object was evicted, so
416+
// set page out after current epoch (or just page out if clean?)
417+
continue;
418+
}
419+
414420
let node = if let Some(n) = self.object_id_index.get(&object_id) {
415421
if *n.object_id != *node_to_evict {
416422
continue;

0 commit comments

Comments
 (0)