Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 611a69c

Browse files
committed
Add a chunk cache when decoding mini-block chunks so that consecutive instructions that reference the same chunk don't have to decode it multiple times.
1 parent 00703be commit 611a69c

File tree

1 file changed

+35
-6
lines changed

1 file changed

+35
-6
lines changed

rust/lance-encoding/src/encodings/logical/primitive.rs

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ struct ChunkMeta {
127127
}
128128

129129
/// A mini-block chunk that has been decoded and decompressed
130-
#[derive(Debug)]
130+
#[derive(Debug, Clone)]
131131
struct DecodedMiniBlockChunk {
132132
rep: Option<ScalarBuffer<u16>>,
133133
def: Option<ScalarBuffer<u16>>,
@@ -527,13 +527,42 @@ impl DecodePageTask for DecodeMiniBlockTask {
527527

528528
// We need to keep track of the offset into repbuf/defbuf that we are building up
529529
let mut level_offset = 0;
530+
531+
// Pre-compute caching needs for each chunk by checking if the next chunk is the same
532+
let needs_caching: Vec<bool> = self
533+
.instructions
534+
.windows(2)
535+
.map(|w| w[0].1.chunk_idx == w[1].1.chunk_idx)
536+
.chain(std::iter::once(false)) // the last one never needs caching
537+
.collect();
538+
539+
// Cache for storing decoded chunks when beneficial
540+
let mut chunk_cache: Option<(usize, DecodedMiniBlockChunk)> = None;
541+
530542
// Now we iterate through each instruction and process it
531-
for (instructions, chunk) in self.instructions.iter() {
532-
// TODO: It's very possible that we have duplicate `buf` in self.instructions and we
533-
// don't want to decode the buf again and again on the same thread.
543+
for (idx, (instructions, chunk)) in self.instructions.iter().enumerate() {
544+
let should_cache_this_chunk = needs_caching[idx];
545+
546+
let decoded_chunk = match &chunk_cache {
547+
Some((cached_chunk_idx, ref cached_chunk))
548+
if *cached_chunk_idx == chunk.chunk_idx =>
549+
{
550+
// Clone only when we have a cache hit (much cheaper than decoding)
551+
cached_chunk.clone()
552+
}
553+
_ => {
554+
// Cache miss, need to decode
555+
let decoded = self.decode_miniblock_chunk(&chunk.data, chunk.items_in_chunk)?;
556+
557+
// Only update cache if this chunk will benefit the next access
558+
if should_cache_this_chunk {
559+
chunk_cache = Some((chunk.chunk_idx, decoded.clone()));
560+
}
561+
decoded
562+
}
563+
};
534564

535-
let DecodedMiniBlockChunk { rep, def, values } =
536-
self.decode_miniblock_chunk(&chunk.data, chunk.items_in_chunk)?;
565+
let DecodedMiniBlockChunk { rep, def, values } = decoded_chunk;
537566

538567
// Our instructions tell us which rows we want to take from this chunk
539568
let row_range_start =

0 commit comments

Comments
 (0)