1515
1616#include < map>
1717#include < memory>
18+ #include < string>
19+ #include < unordered_map>
1820
1921#include " db/dbformat.h"
2022
4143
4244namespace rocksdb {
4345
46+ extern const std::string kHashIndexPrefixesBlock ;
47+ extern const std::string kHashIndexPrefixesMetadataBlock ;
4448namespace {
4549
4650typedef BlockBasedTableOptions::IndexType IndexType;
@@ -57,6 +61,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
5761// design that just works.
5862class IndexBuilder {
5963 public:
64+ // Index builder will construct a set of blocks which contain:
65+ // 1. One primary index block.
66+ // 2. (Optional) a set of metablocks that contains the metadata of the
67+ // primary index.
68+ struct IndexBlocks {
69+ Slice index_block_contents;
70+ std::unordered_map<std::string, Slice> meta_blocks;
71+ };
6072 explicit IndexBuilder (const Comparator* comparator)
6173 : comparator_(comparator) {}
6274
@@ -72,15 +84,19 @@ class IndexBuilder {
7284 // the last one in the table
7385 //
7486 // REQUIRES: Finish() has not yet been called.
75- virtual void AddEntry (std::string* last_key_in_current_block,
76- const Slice* first_key_in_next_block,
77- const BlockHandle& block_handle) = 0;
87+ virtual void AddIndexEntry (std::string* last_key_in_current_block,
88+ const Slice* first_key_in_next_block,
89+ const BlockHandle& block_handle) = 0;
90+
91+ // This method will be called whenever a key is added. The subclasses may
92+ // override OnKeyAdded() if they need to collect additional information.
93+ virtual void OnKeyAdded (const Slice& key) {}
7894
7995 // Inform the index builder that all entries have been written. Block builder
8096 // may therefore perform any operation required for block finalization.
8197 //
8298 // REQUIRES: Finish() has not yet been called.
83- virtual Slice Finish () = 0;
99+ virtual Status Finish (IndexBlocks* index_blocks ) = 0;
84100
85101 // Get the estimated size for index block.
86102 virtual size_t EstimatedSize () const = 0;
@@ -103,9 +119,9 @@ class ShortenedIndexBuilder : public IndexBuilder {
103119 : IndexBuilder(comparator),
104120 index_block_builder_(1 /* block_restart_interval == 1 */ , comparator) {}
105121
106- virtual void AddEntry (std::string* last_key_in_current_block,
107- const Slice* first_key_in_next_block,
108- const BlockHandle& block_handle) override {
122+ virtual void AddIndexEntry (std::string* last_key_in_current_block,
123+ const Slice* first_key_in_next_block,
124+ const BlockHandle& block_handle) override {
109125 if (first_key_in_next_block != nullptr ) {
110126 comparator_->FindShortestSeparator (last_key_in_current_block,
111127 *first_key_in_next_block);
@@ -118,7 +134,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
118134 index_block_builder_.Add (*last_key_in_current_block, handle_encoding);
119135 }
120136
121- virtual Slice Finish () override { return index_block_builder_.Finish (); }
137+ virtual Status Finish (IndexBlocks* index_blocks) {
138+ index_blocks->index_block_contents = index_block_builder_.Finish ();
139+ return Status::OK ();
140+ }
122141
123142 virtual size_t EstimatedSize () const {
124143 return index_block_builder_.CurrentSizeEstimate ();
@@ -128,38 +147,125 @@ class ShortenedIndexBuilder : public IndexBuilder {
128147 BlockBuilder index_block_builder_;
129148};
130149
131- // FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
132- // ShortenedIndexBuilder, but preserves the full key instead the substitude key.
133- class FullKeyIndexBuilder : public IndexBuilder {
150+ // HashIndexBuilder contains a binary-searchable primary index and the
151+ // metadata for secondary hash index construction.
152+ // The metadata for hash index consists two parts:
153+ // - a metablock that compactly contains a sequence of prefixes. All prefixes
154+ // are stored consectively without any metadata (like, prefix sizes) being
155+ // stored, which is kept in the other metablock.
156+ // - a metablock contains the metadata of the prefixes, including prefix size,
157+ // restart index and number of block it spans. The format looks like:
158+ //
159+ // +-----------------+---------------------------+---------------------+ <=prefix 1
160+ // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
161+ // +-----------------+---------------------------+---------------------+ <=prefix 2
162+ // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
163+ // +-----------------+---------------------------+---------------------+
164+ // | |
165+ // | .... |
166+ // | |
167+ // +-----------------+---------------------------+---------------------+ <=prefix n
168+ // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
169+ // +-----------------+---------------------------+---------------------+
170+ //
171+ // The reason of separating these two metablocks is to enable the efficiently
172+ // reuse the first metablock during hash index construction without unnecessary
173+ // data copy or small heap allocations for prefixes.
174+ class HashIndexBuilder : public IndexBuilder {
134175 public:
135- explicit FullKeyIndexBuilder (const Comparator* comparator)
176+ explicit HashIndexBuilder (const Comparator* comparator,
177+ const SliceTransform* hash_key_extractor)
136178 : IndexBuilder(comparator),
137- index_block_builder_(1 /* block_restart_interval == 1 */ , comparator) {}
179+ primary_index_builder(comparator),
180+ hash_key_extractor_(hash_key_extractor) {}
181+
182+ virtual void AddIndexEntry (std::string* last_key_in_current_block,
183+ const Slice* first_key_in_next_block,
184+ const BlockHandle& block_handle) override {
185+ ++current_restart_index_;
186+ primary_index_builder.AddIndexEntry (last_key_in_current_block,
187+ first_key_in_next_block, block_handle);
188+ }
138189
139- virtual void AddEntry (std::string* last_key_in_current_block,
140- const Slice* first_key_in_next_block,
141- const BlockHandle& block_handle) override {
142- std::string handle_encoding;
143- block_handle.EncodeTo (&handle_encoding);
144- index_block_builder_.Add (*last_key_in_current_block, handle_encoding);
190+ virtual void OnKeyAdded (const Slice& key) override {
191+ auto key_prefix = hash_key_extractor_->Transform (key);
192+ bool is_first_entry = pending_block_num_ == 0 ;
193+
194+ // Keys may share the prefix
195+ if (is_first_entry || pending_entry_prefix_ != key_prefix) {
196+ if (!is_first_entry) {
197+ FlushPendingPrefix ();
198+ }
199+
200+ // need a hard copy otherwise the underlying data changes all the time.
201+ // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
202+ // copy.
203+ pending_entry_prefix_ = key_prefix.ToString ();
204+ pending_block_num_ = 1 ;
205+ pending_entry_index_ = current_restart_index_;
206+ } else {
207+ // entry number increments when keys share the prefix reside in
208+ // differnt data blocks.
209+ auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1 ;
210+ assert (last_restart_index <= current_restart_index_);
211+ if (last_restart_index != current_restart_index_) {
212+ ++pending_block_num_;
213+ }
214+ }
145215 }
146216
147- virtual Slice Finish () override { return index_block_builder_.Finish (); }
217+ virtual Status Finish (IndexBlocks* index_blocks) {
218+ FlushPendingPrefix ();
219+ primary_index_builder.Finish (index_blocks);
220+ index_blocks->meta_blocks .insert (
221+ {kHashIndexPrefixesBlock .c_str (), prefix_block_});
222+ index_blocks->meta_blocks .insert (
223+ {kHashIndexPrefixesMetadataBlock .c_str (), prefix_meta_block_});
224+ return Status::OK ();
225+ }
148226
149227 virtual size_t EstimatedSize () const {
150- return index_block_builder_.CurrentSizeEstimate ();
228+ return primary_index_builder.EstimatedSize () + prefix_block_.size () +
229+ prefix_meta_block_.size ();
151230 }
152231
153232 private:
154- BlockBuilder index_block_builder_;
233+ void FlushPendingPrefix () {
234+ prefix_block_.append (pending_entry_prefix_.data (),
235+ pending_entry_prefix_.size ());
236+ PutVarint32 (&prefix_meta_block_, pending_entry_prefix_.size ());
237+ PutVarint32 (&prefix_meta_block_, pending_entry_index_);
238+ PutVarint32 (&prefix_meta_block_, pending_block_num_);
239+ }
240+
241+ ShortenedIndexBuilder primary_index_builder;
242+ const SliceTransform* hash_key_extractor_;
243+
244+ // stores a sequence of prefixes
245+ std::string prefix_block_;
246+ // stores the metadata of prefixes
247+ std::string prefix_meta_block_;
248+
249+ // The following 3 variables keeps unflushed prefix and its metadata.
250+ // The details of block_num and entry_index can be found in
251+ // "block_hash_index.{h,cc}"
252+ uint32_t pending_block_num_ = 0 ;
253+ uint32_t pending_entry_index_ = 0 ;
254+ std::string pending_entry_prefix_;
255+
256+ uint64_t current_restart_index_ = 0 ;
155257};
156258
157259// Create an index builder based on its type.
158- IndexBuilder* CreateIndexBuilder (IndexType type, const Comparator* comparator) {
260+ IndexBuilder* CreateIndexBuilder (IndexType type, const Comparator* comparator,
261+ const SliceTransform* prefix_extractor) {
159262 switch (type) {
160263 case BlockBasedTableOptions::kBinarySearch : {
161264 return new ShortenedIndexBuilder (comparator);
162265 }
266+ case BlockBasedTableOptions::kHashSearch : {
267+ return new HashIndexBuilder (comparator, prefix_extractor);
268+ }
163269 default : {
164270 assert (!" Do not recognize the index type " );
165271 return nullptr ;
@@ -249,7 +355,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
249355class BlockBasedTableBuilder ::BlockBasedTablePropertiesCollector
250356 : public TablePropertiesCollector {
251357 public:
252- BlockBasedTablePropertiesCollector (
358+ explicit BlockBasedTablePropertiesCollector (
253359 BlockBasedTableOptions::IndexType index_type)
254360 : index_type_(index_type) {}
255361
@@ -288,6 +394,8 @@ struct BlockBasedTableBuilder::Rep {
288394 uint64_t offset = 0 ;
289395 Status status;
290396 BlockBuilder data_block;
397+
398+ InternalKeySliceTransform internal_prefix_transform;
291399 std::unique_ptr<IndexBuilder> index_builder;
292400
293401 std::string last_key;
@@ -316,8 +424,9 @@ struct BlockBasedTableBuilder::Rep {
316424 internal_comparator (icomparator),
317425 file(f),
318426 data_block(options, &internal_comparator),
319- index_builder(
320- CreateIndexBuilder (index_block_type, &internal_comparator)),
427+ internal_prefix_transform(options.prefix_extractor.get()),
428+ index_builder(CreateIndexBuilder(index_block_type, &internal_comparator,
429+ &this ->internal_prefix_transform)),
321430 compression_type(compression_type),
322431 checksum_type(checksum_type),
323432 filter_block(opt.filter_policy == nullptr
@@ -335,16 +444,13 @@ struct BlockBasedTableBuilder::Rep {
335444 }
336445};
337446
338- // TODO(sdong): Currently only write out binary search index. In
339- // BlockBasedTableReader, Hash index will be built using binary search index.
340447BlockBasedTableBuilder::BlockBasedTableBuilder (
341448 const Options& options, const BlockBasedTableOptions& table_options,
342449 const InternalKeyComparator& internal_comparator, WritableFile* file,
343450 CompressionType compression_type)
344451 : rep_(new Rep(options, internal_comparator, file,
345452 table_options.flush_block_policy_factory.get(),
346- compression_type,
347- BlockBasedTableOptions::IndexType::kBinarySearch,
453+ compression_type, table_options.index_type,
348454 table_options.checksum)) {
349455 if (rep_->filter_block != nullptr ) {
350456 rep_->filter_block ->StartBlock (0 );
@@ -370,7 +476,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
370476 if (r->props .num_entries > 0 ) {
371477 assert (r->internal_comparator .Compare (key, Slice (r->last_key )) > 0 );
372478 }
373-
479+ r-> index_builder -> OnKeyAdded (key);
374480 auto should_flush = r->flush_block_policy ->Update (key, value);
375481 if (should_flush) {
376482 assert (!r->data_block .empty ());
@@ -385,7 +491,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
385491 // entries in the first block and < all entries in subsequent
386492 // blocks.
387493 if (ok ()) {
388- r->index_builder ->AddEntry (&r->last_key , &key, r->pending_handle );
494+ r->index_builder ->AddIndexEntry (&r->last_key , &key, r->pending_handle );
389495 }
390496 }
391497
@@ -561,24 +667,36 @@ Status BlockBasedTableBuilder::Finish() {
561667 // block, we will finish writing all index entries here and flush them
562668 // to storage after metaindex block is written.
563669 if (ok () && !empty_data_block) {
564- r->index_builder ->AddEntry (&r->last_key , nullptr /* no next data block */ ,
565- r->pending_handle );
670+ r->index_builder ->AddIndexEntry (
671+ &r->last_key , nullptr /* no next data block */ , r->pending_handle );
672+ }
673+
674+ IndexBuilder::IndexBlocks index_blocks;
675+ auto s = r->index_builder ->Finish (&index_blocks);
676+ if (!s.ok ()) {
677+ return s;
566678 }
567679
568680 // Write meta blocks and metaindex block with the following order.
569681 // 1. [meta block: filter]
570- // 2. [meta block: properties]
571- // 3. [metaindex block]
572- if (ok ()) {
573- MetaIndexBuilder meta_index_builer;
682+ // 2. [other meta blocks]
683+ // 3. [meta block: properties]
684+ // 4. [metaindex block]
685+ // write meta blocks
686+ MetaIndexBuilder meta_index_builder;
687+ for (const auto & item : index_blocks.meta_blocks ) {
688+ BlockHandle block_handle;
689+ WriteBlock (item.second , &block_handle);
690+ meta_index_builder.Add (item.first , block_handle);
691+ }
574692
575- // Write filter block.
693+ if ( ok ()) {
576694 if (r->filter_block != nullptr ) {
577695 // Add mapping from "<filter_block_prefix>.Name" to location
578696 // of filter data.
579697 std::string key = BlockBasedTable::kFilterBlockPrefix ;
580698 key.append (r->options .filter_policy ->Name ());
581- meta_index_builer .Add (key, filter_block_handle);
699+ meta_index_builder .Add (key, filter_block_handle);
582700 }
583701
584702 // Write properties block.
@@ -605,20 +723,16 @@ Status BlockBasedTableBuilder::Finish() {
605723 &properties_block_handle
606724 );
607725
608- meta_index_builer.Add (kPropertiesBlock ,
609- properties_block_handle);
726+ meta_index_builder.Add (kPropertiesBlock , properties_block_handle);
610727 } // end of properties block writing
611-
612- WriteRawBlock (
613- meta_index_builer.Finish (),
614- kNoCompression ,
615- &metaindex_block_handle
616- );
617- } // meta blocks and metaindex block.
728+ } // meta blocks
618729
619730 // Write index block
620731 if (ok ()) {
621- WriteBlock (r->index_builder ->Finish (), &index_block_handle);
732+ // flush the meta index block
733+ WriteRawBlock (meta_index_builder.Finish (), kNoCompression ,
734+ &metaindex_block_handle);
735+ WriteBlock (index_blocks.index_block_contents , &index_block_handle);
622736 }
623737
624738 // Write footer
@@ -685,7 +799,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
685799 return rep_->offset ;
686800}
687801
688- const std::string BlockBasedTable::kFilterBlockPrefix =
689- " filter." ;
802+ const std::string BlockBasedTable::kFilterBlockPrefix = " filter." ;
690803
691804} // namespace rocksdb
0 commit comments