Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 0b3d03d

Browse files
author
Kai Liu
committed
Materialize the hash index
Summary: Materialize the hash index to avoid the soaring cpu/flash usage when initializing the database. Test Plan: existing unit tests passed Reviewers: sdong, haobo Reviewed By: sdong CC: leveldb Differential Revision: https://reviews.facebook.net/D18339
1 parent 4e0602f commit 0b3d03d

12 files changed

Lines changed: 389 additions & 141 deletions

HISTORY.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
### Public API changes
66
* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
77

8+
### New Features
9+
* The hash index for block-based tables is now materialized and reconstructed more efficiently. Previously, the hash index was constructed by scanning the whole table on every table open.
10+
811
## 3.0.0 (05/05/2014)
912

1013
### Public API changes

table/block_based_table_builder.cc

Lines changed: 165 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
#include <map>
1717
#include <memory>
18+
#include <string>
19+
#include <unordered_map>
1820

1921
#include "db/dbformat.h"
2022

@@ -41,6 +43,8 @@
4143

4244
namespace rocksdb {
4345

46+
extern const std::string kHashIndexPrefixesBlock;
47+
extern const std::string kHashIndexPrefixesMetadataBlock;
4448
namespace {
4549

4650
typedef BlockBasedTableOptions::IndexType IndexType;
@@ -57,6 +61,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
5761
// design that just works.
5862
class IndexBuilder {
5963
public:
64+
// Index builder will construct a set of blocks which contain:
65+
// 1. One primary index block.
66+
// 2. (Optional) a set of metablocks that contains the metadata of the
67+
// primary index.
68+
struct IndexBlocks {
69+
Slice index_block_contents;
70+
std::unordered_map<std::string, Slice> meta_blocks;
71+
};
6072
explicit IndexBuilder(const Comparator* comparator)
6173
: comparator_(comparator) {}
6274

@@ -72,15 +84,19 @@ class IndexBuilder {
7284
// the last one in the table
7385
//
7486
// REQUIRES: Finish() has not yet been called.
75-
virtual void AddEntry(std::string* last_key_in_current_block,
76-
const Slice* first_key_in_next_block,
77-
const BlockHandle& block_handle) = 0;
87+
virtual void AddIndexEntry(std::string* last_key_in_current_block,
88+
const Slice* first_key_in_next_block,
89+
const BlockHandle& block_handle) = 0;
90+
91+
// This method will be called whenever a key is added. The subclasses may
92+
// override OnKeyAdded() if they need to collect additional information.
93+
virtual void OnKeyAdded(const Slice& key) {}
7894

7995
// Inform the index builder that all entries have been written. Block builder
8096
// may therefore perform any operation required for block finalization.
8197
//
8298
// REQUIRES: Finish() has not yet been called.
83-
virtual Slice Finish() = 0;
99+
virtual Status Finish(IndexBlocks* index_blocks) = 0;
84100

85101
// Get the estimated size for index block.
86102
virtual size_t EstimatedSize() const = 0;
@@ -103,9 +119,9 @@ class ShortenedIndexBuilder : public IndexBuilder {
103119
: IndexBuilder(comparator),
104120
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
105121

106-
virtual void AddEntry(std::string* last_key_in_current_block,
107-
const Slice* first_key_in_next_block,
108-
const BlockHandle& block_handle) override {
122+
virtual void AddIndexEntry(std::string* last_key_in_current_block,
123+
const Slice* first_key_in_next_block,
124+
const BlockHandle& block_handle) override {
109125
if (first_key_in_next_block != nullptr) {
110126
comparator_->FindShortestSeparator(last_key_in_current_block,
111127
*first_key_in_next_block);
@@ -118,7 +134,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
118134
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
119135
}
120136

121-
virtual Slice Finish() override { return index_block_builder_.Finish(); }
137+
virtual Status Finish(IndexBlocks* index_blocks) {
138+
index_blocks->index_block_contents = index_block_builder_.Finish();
139+
return Status::OK();
140+
}
122141

123142
virtual size_t EstimatedSize() const {
124143
return index_block_builder_.CurrentSizeEstimate();
@@ -128,38 +147,125 @@ class ShortenedIndexBuilder : public IndexBuilder {
128147
BlockBuilder index_block_builder_;
129148
};
130149

131-
// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
132-
// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
133-
class FullKeyIndexBuilder : public IndexBuilder {
150+
// HashIndexBuilder contains a binary-searchable primary index and the
151+
// metadata for secondary hash index construction.
152+
// The metadata for hash index consists of two parts:
153+
// - a metablock that compactly contains a sequence of prefixes. All prefixes
154+
// are stored consecutively without any metadata (like, prefix sizes) being
155+
// stored, which is kept in the other metablock.
156+
// - a metablock that contains the metadata of the prefixes, including prefix size,
157+
// restart index and number of blocks it spans. The format looks like:
158+
//
159+
// +-----------------+---------------------------+---------------------+ <=prefix 1
160+
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
161+
// +-----------------+---------------------------+---------------------+ <=prefix 2
162+
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
163+
// +-----------------+---------------------------+---------------------+
164+
// | |
165+
// | .... |
166+
// | |
167+
// +-----------------+---------------------------+---------------------+ <=prefix n
168+
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
169+
// +-----------------+---------------------------+---------------------+
170+
//
171+
// The reason for separating these two metablocks is to efficiently
172+
// reuse the first metablock during hash index construction without unnecessary
173+
// data copy or small heap allocations for prefixes.
174+
class HashIndexBuilder : public IndexBuilder {
134175
public:
135-
explicit FullKeyIndexBuilder(const Comparator* comparator)
176+
explicit HashIndexBuilder(const Comparator* comparator,
177+
const SliceTransform* hash_key_extractor)
136178
: IndexBuilder(comparator),
137-
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
179+
primary_index_builder(comparator),
180+
hash_key_extractor_(hash_key_extractor) {}
181+
182+
virtual void AddIndexEntry(std::string* last_key_in_current_block,
183+
const Slice* first_key_in_next_block,
184+
const BlockHandle& block_handle) override {
185+
++current_restart_index_;
186+
primary_index_builder.AddIndexEntry(last_key_in_current_block,
187+
first_key_in_next_block, block_handle);
188+
}
138189

139-
virtual void AddEntry(std::string* last_key_in_current_block,
140-
const Slice* first_key_in_next_block,
141-
const BlockHandle& block_handle) override {
142-
std::string handle_encoding;
143-
block_handle.EncodeTo(&handle_encoding);
144-
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
190+
virtual void OnKeyAdded(const Slice& key) override {
191+
auto key_prefix = hash_key_extractor_->Transform(key);
192+
bool is_first_entry = pending_block_num_ == 0;
193+
194+
// Keys may share the prefix
195+
if (is_first_entry || pending_entry_prefix_ != key_prefix) {
196+
if (!is_first_entry) {
197+
FlushPendingPrefix();
198+
}
199+
200+
// need a hard copy otherwise the underlying data changes all the time.
201+
// TODO(kailiu) ToString() is expensive. We may speed this up by avoiding the data
202+
// copy.
203+
pending_entry_prefix_ = key_prefix.ToString();
204+
pending_block_num_ = 1;
205+
pending_entry_index_ = current_restart_index_;
206+
} else {
207+
// entry number increments when keys sharing the prefix reside in
208+
// different data blocks.
209+
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
210+
assert(last_restart_index <= current_restart_index_);
211+
if (last_restart_index != current_restart_index_) {
212+
++pending_block_num_;
213+
}
214+
}
145215
}
146216

147-
virtual Slice Finish() override { return index_block_builder_.Finish(); }
217+
virtual Status Finish(IndexBlocks* index_blocks) {
218+
FlushPendingPrefix();
219+
primary_index_builder.Finish(index_blocks);
220+
index_blocks->meta_blocks.insert(
221+
{kHashIndexPrefixesBlock.c_str(), prefix_block_});
222+
index_blocks->meta_blocks.insert(
223+
{kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
224+
return Status::OK();
225+
}
148226

149227
virtual size_t EstimatedSize() const {
150-
return index_block_builder_.CurrentSizeEstimate();
228+
return primary_index_builder.EstimatedSize() + prefix_block_.size() +
229+
prefix_meta_block_.size();
151230
}
152231

153232
private:
154-
BlockBuilder index_block_builder_;
233+
void FlushPendingPrefix() {
234+
prefix_block_.append(pending_entry_prefix_.data(),
235+
pending_entry_prefix_.size());
236+
PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size());
237+
PutVarint32(&prefix_meta_block_, pending_entry_index_);
238+
PutVarint32(&prefix_meta_block_, pending_block_num_);
239+
}
240+
241+
ShortenedIndexBuilder primary_index_builder;
242+
const SliceTransform* hash_key_extractor_;
243+
244+
// stores a sequence of prefixes
245+
std::string prefix_block_;
246+
// stores the metadata of prefixes
247+
std::string prefix_meta_block_;
248+
249+
// The following 3 variables keep the unflushed prefix and its metadata.
250+
// The details of block_num and entry_index can be found in
251+
// "block_hash_index.{h,cc}"
252+
uint32_t pending_block_num_ = 0;
253+
uint32_t pending_entry_index_ = 0;
254+
std::string pending_entry_prefix_;
255+
256+
uint64_t current_restart_index_ = 0;
155257
};
156258

157259
// Create an index builder based on its type.
158-
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
260+
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator,
261+
const SliceTransform* prefix_extractor) {
159262
switch (type) {
160263
case BlockBasedTableOptions::kBinarySearch: {
161264
return new ShortenedIndexBuilder(comparator);
162265
}
266+
case BlockBasedTableOptions::kHashSearch: {
267+
return new HashIndexBuilder(comparator, prefix_extractor);
268+
}
163269
default: {
164270
assert(!"Do not recognize the index type ");
165271
return nullptr;
@@ -249,7 +355,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
249355
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
250356
: public TablePropertiesCollector {
251357
public:
252-
BlockBasedTablePropertiesCollector(
358+
explicit BlockBasedTablePropertiesCollector(
253359
BlockBasedTableOptions::IndexType index_type)
254360
: index_type_(index_type) {}
255361

@@ -288,6 +394,8 @@ struct BlockBasedTableBuilder::Rep {
288394
uint64_t offset = 0;
289395
Status status;
290396
BlockBuilder data_block;
397+
398+
InternalKeySliceTransform internal_prefix_transform;
291399
std::unique_ptr<IndexBuilder> index_builder;
292400

293401
std::string last_key;
@@ -316,8 +424,9 @@ struct BlockBasedTableBuilder::Rep {
316424
internal_comparator(icomparator),
317425
file(f),
318426
data_block(options, &internal_comparator),
319-
index_builder(
320-
CreateIndexBuilder(index_block_type, &internal_comparator)),
427+
internal_prefix_transform(options.prefix_extractor.get()),
428+
index_builder(CreateIndexBuilder(index_block_type, &internal_comparator,
429+
&this->internal_prefix_transform)),
321430
compression_type(compression_type),
322431
checksum_type(checksum_type),
323432
filter_block(opt.filter_policy == nullptr
@@ -335,16 +444,13 @@ struct BlockBasedTableBuilder::Rep {
335444
}
336445
};
337446

338-
// TODO(sdong): Currently only write out binary search index. In
339-
// BlockBasedTableReader, Hash index will be built using binary search index.
340447
BlockBasedTableBuilder::BlockBasedTableBuilder(
341448
const Options& options, const BlockBasedTableOptions& table_options,
342449
const InternalKeyComparator& internal_comparator, WritableFile* file,
343450
CompressionType compression_type)
344451
: rep_(new Rep(options, internal_comparator, file,
345452
table_options.flush_block_policy_factory.get(),
346-
compression_type,
347-
BlockBasedTableOptions::IndexType::kBinarySearch,
453+
compression_type, table_options.index_type,
348454
table_options.checksum)) {
349455
if (rep_->filter_block != nullptr) {
350456
rep_->filter_block->StartBlock(0);
@@ -370,7 +476,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
370476
if (r->props.num_entries > 0) {
371477
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
372478
}
373-
479+
r->index_builder->OnKeyAdded(key);
374480
auto should_flush = r->flush_block_policy->Update(key, value);
375481
if (should_flush) {
376482
assert(!r->data_block.empty());
@@ -385,7 +491,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
385491
// entries in the first block and < all entries in subsequent
386492
// blocks.
387493
if (ok()) {
388-
r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
494+
r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
389495
}
390496
}
391497

@@ -561,24 +667,36 @@ Status BlockBasedTableBuilder::Finish() {
561667
// block, we will finish writing all index entries here and flush them
562668
// to storage after metaindex block is written.
563669
if (ok() && !empty_data_block) {
564-
r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
565-
r->pending_handle);
670+
r->index_builder->AddIndexEntry(
671+
&r->last_key, nullptr /* no next data block */, r->pending_handle);
672+
}
673+
674+
IndexBuilder::IndexBlocks index_blocks;
675+
auto s = r->index_builder->Finish(&index_blocks);
676+
if (!s.ok()) {
677+
return s;
566678
}
567679

568680
// Write meta blocks and metaindex block with the following order.
569681
// 1. [meta block: filter]
570-
// 2. [meta block: properties]
571-
// 3. [metaindex block]
572-
if (ok()) {
573-
MetaIndexBuilder meta_index_builer;
682+
// 2. [other meta blocks]
683+
// 3. [meta block: properties]
684+
// 4. [metaindex block]
685+
// write meta blocks
686+
MetaIndexBuilder meta_index_builder;
687+
for (const auto& item : index_blocks.meta_blocks) {
688+
BlockHandle block_handle;
689+
WriteBlock(item.second, &block_handle);
690+
meta_index_builder.Add(item.first, block_handle);
691+
}
574692

575-
// Write filter block.
693+
if (ok()) {
576694
if (r->filter_block != nullptr) {
577695
// Add mapping from "<filter_block_prefix>.Name" to location
578696
// of filter data.
579697
std::string key = BlockBasedTable::kFilterBlockPrefix;
580698
key.append(r->options.filter_policy->Name());
581-
meta_index_builer.Add(key, filter_block_handle);
699+
meta_index_builder.Add(key, filter_block_handle);
582700
}
583701

584702
// Write properties block.
@@ -605,20 +723,16 @@ Status BlockBasedTableBuilder::Finish() {
605723
&properties_block_handle
606724
);
607725

608-
meta_index_builer.Add(kPropertiesBlock,
609-
properties_block_handle);
726+
meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
610727
} // end of properties block writing
611-
612-
WriteRawBlock(
613-
meta_index_builer.Finish(),
614-
kNoCompression,
615-
&metaindex_block_handle
616-
);
617-
} // meta blocks and metaindex block.
728+
} // meta blocks
618729

619730
// Write index block
620731
if (ok()) {
621-
WriteBlock(r->index_builder->Finish(), &index_block_handle);
732+
// flush the meta index block
733+
WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
734+
&metaindex_block_handle);
735+
WriteBlock(index_blocks.index_block_contents, &index_block_handle);
622736
}
623737

624738
// Write footer
@@ -685,7 +799,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
685799
return rep_->offset;
686800
}
687801

688-
const std::string BlockBasedTable::kFilterBlockPrefix =
689-
"filter.";
802+
const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
690803

691804
} // namespace rocksdb

table/block_based_table_factory.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,8 @@ TableFactory* NewBlockBasedTableFactory(
5656

5757
const std::string BlockBasedTablePropertyNames::kIndexType =
5858
"rocksdb.block.based.table.index.type";
59+
const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
60+
const std::string kHashIndexPrefixesMetadataBlock =
61+
"rocksdb.hashindex.metadata";
5962

6063
} // namespace rocksdb

0 commit comments

Comments
 (0)