Merged
287 changes: 87 additions & 200 deletions extension/parquet/column_reader.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions extension/parquet/decoder/byte_stream_split_decoder.cpp
@@ -21,7 +21,7 @@ void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &re
 
 	auto &allocator = reader.reader.allocator;
 	decoded_data_buffer.reset();
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::FLOAT:
 		decoded_data_buffer.resize(allocator, sizeof(float) * valid_count);
 		bss_decoder->GetBatch<float>(decoded_data_buffer.ptr, valid_count);
@@ -39,7 +39,7 @@ void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &re
 
 void ByteStreamSplitDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 	idx_t valid_count = reader.GetValidCount(defines, skip_count);
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::FLOAT:
 		bss_decoder->Skip<float>(valid_count);
 		break;
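
Every decoder file below repeats this same substitution, so it is worth seeing the shape of the change once in isolation. A self-contained toy sketch (the accessor names mirror the PR; the types are stand-ins, not DuckDB's) showing direct member access replaced by schema-backed accessors:

#include <cstdint>
#include <iostream>

enum class ParquetType { FLOAT, DOUBLE };

// Stand-in for the consolidated per-column metadata introduced by the PR.
struct ParquetColumnSchema {
	ParquetType parquet_type;
	uint64_t max_define;
};

class ColumnReader {
public:
	explicit ColumnReader(const ParquetColumnSchema &schema) : column_schema(schema) {}
	// Accessors replace the old public members (schema, type, max_define, ...).
	const ParquetColumnSchema &Schema() const { return column_schema; }
	uint64_t MaxDefine() const { return column_schema.max_define; }

private:
	const ParquetColumnSchema &column_schema; // single source of truth
};

int main() {
	ParquetColumnSchema schema{ParquetType::FLOAT, /*max_define=*/1};
	ColumnReader reader(schema);
	// Old call sites read reader.schema.type and reader.max_define;
	// new call sites, as in the hunks above and below:
	std::cout << (reader.Schema().parquet_type == ParquetType::FLOAT) << " "
	          << reader.MaxDefine() << "\n";
	return 0;
}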
6 changes: 2 additions & 4 deletions extension/parquet/decoder/delta_binary_packed_decoder.cpp
@@ -19,11 +19,10 @@ void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &
 
 	auto &allocator = reader.reader.allocator;
 	decoded_data_buffer.reset();
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::INT32:
 		decoded_data_buffer.resize(allocator, sizeof(int32_t) * (valid_count));
 		dbp_decoder->GetBatch<int32_t>(decoded_data_buffer.ptr, valid_count);
-
 		break;
 	case duckdb_parquet::Type::INT64:
 		decoded_data_buffer.resize(allocator, sizeof(int64_t) * (valid_count));
@@ -39,10 +38,9 @@ void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &
 
 void DeltaBinaryPackedDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 	idx_t valid_count = reader.GetValidCount(defines, skip_count);
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::INT32:
 		dbp_decoder->Skip<int32_t>(valid_count);
-
 		break;
 	case duckdb_parquet::Type::INT64:
 		dbp_decoder->Skip<int64_t>(valid_count);
6 changes: 3 additions & 3 deletions extension/parquet/decoder/delta_byte_array_decoder.cpp
@@ -20,7 +20,7 @@ void DeltaByteArrayDecoder::ReadDbpData(Allocator &allocator, ResizeableBuffer &
 }
 
 void DeltaByteArrayDecoder::InitializePage() {
-	if (reader.type.InternalType() != PhysicalType::VARCHAR) {
+	if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
 		throw std::runtime_error("Delta Byte Array encoding is only supported for string/blob data");
 	}
 	auto &block = *reader.block;
@@ -69,7 +69,7 @@ void DeltaByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vector &res
 	auto &result_mask = FlatVector::Validity(result);
 	auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
 	for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
-		if (defines && defines[row_idx + result_offset] != reader.max_define) {
+		if (defines && defines[row_idx + result_offset] != reader.MaxDefine()) {
 			result_mask.SetInvalid(row_idx + result_offset);
 			continue;
 		}
@@ -88,7 +88,7 @@ void DeltaByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 		throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
 	}
 	for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
-		if (defines && defines[row_idx] != reader.max_define) {
+		if (defines && defines[row_idx] != reader.MaxDefine()) {
 			continue;
 		}
 		if (delta_offset >= byte_array_count) {
6 changes: 3 additions & 3 deletions extension/parquet/decoder/delta_length_byte_array_decoder.cpp
@@ -11,7 +11,7 @@ DeltaLengthByteArrayDecoder::DeltaLengthByteArrayDecoder(ColumnReader &reader)
 }
 
 void DeltaLengthByteArrayDecoder::InitializePage() {
-	if (reader.type.InternalType() != PhysicalType::VARCHAR) {
+	if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
 		throw std::runtime_error("Delta Length Byte Array encoding is only supported for string/blob data");
 	}
 	// read the binary packed lengths
@@ -28,7 +28,7 @@ void DeltaLengthByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vecto
 	auto &result_mask = FlatVector::Validity(result);
 	for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
 		auto result_idx = result_offset + row_idx;
-		if (defines && defines[result_idx] != reader.max_define) {
+		if (defines && defines[result_idx] != reader.MaxDefine()) {
 			result_mask.SetInvalid(result_idx);
 			continue;
 		}
@@ -52,7 +52,7 @@ void DeltaLengthByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 	auto &block = *reader.block;
 	auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
 	for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
-		if (defines && defines[row_idx] != reader.max_define) {
+		if (defines && defines[row_idx] != reader.MaxDefine()) {
 			continue;
 		}
 		if (length_idx >= byte_array_count) {
7 changes: 4 additions & 3 deletions extension/parquet/decoder/dictionary_decoder.cpp
@@ -19,11 +19,12 @@ void DictionaryDecoder::InitializeDictionary(idx_t new_dictionary_size, optional
 	filter_count = 0;
 	// we use the first value in the dictionary to keep a NULL
 	if (!dictionary) {
-		dictionary = make_uniq<Vector>(reader.type, dictionary_size + 1);
+		dictionary = make_uniq<Vector>(reader.Type(), dictionary_size + 1);
 	} else if (dictionary_size > old_dict_size) {
 		dictionary->Resize(old_dict_size, dictionary_size + 1);
 	}
-	dictionary_id = reader.reader.file_name + "_" + reader.schema.name + "_" + std::to_string(reader.chunk_read_offset);
+	dictionary_id =
+	    reader.reader.file_name + "_" + reader.Schema().name + "_" + std::to_string(reader.chunk_read_offset);
 	// we use the last entry as a NULL, dictionary vectors don't have a separate validity mask
 	auto &dict_validity = FlatVector::Validity(*dictionary);
 	dict_validity.Reset(dictionary_size + 1);
@@ -77,7 +78,7 @@ idx_t DictionaryDecoder::GetValidValues(uint8_t *defines, idx_t read_count, idx_
 		for (idx_t i = 0; i < read_count; i++) {
 			valid_sel.set_index(valid_count, i);
 			dictionary_selection_vector.set_index(i, dictionary_size);
-			valid_count += defines[result_offset + i] == reader.max_define;
+			valid_count += defines[result_offset + i] == reader.MaxDefine();
 		}
 	}
 	return valid_count;
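
The dictionary_id assembled above keys a cached dictionary vector to one file, one column, and one chunk. A quick illustration with hypothetical values (none of these appear in the PR):

// file_name         = "data/part-0.parquet"
// Schema().name     = "category"
// chunk_read_offset = 4096
// dictionary_id     = "data/part-0.parquet_category_4096"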
4 changes: 2 additions & 2 deletions extension/parquet/decoder/rle_decoder.cpp
@@ -9,7 +9,7 @@ RLEDecoder::RLEDecoder(ColumnReader &reader) : reader(reader), decoded_data_buff
 }
 
 void RLEDecoder::InitializePage() {
-	if (reader.type.id() != LogicalTypeId::BOOLEAN) {
+	if (reader.Type().id() != LogicalTypeId::BOOLEAN) {
 		throw std::runtime_error("RLE encoding is only supported for boolean data");
 	}
 	auto &block = reader.block;
@@ -19,7 +19,7 @@ void RLEDecoder::InitializePage() {
 
 void RLEDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
 	// RLE encoding for boolean
-	D_ASSERT(reader.type.id() == LogicalTypeId::BOOLEAN);
+	D_ASSERT(reader.Type().id() == LogicalTypeId::BOOLEAN);
 	idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);
 	decoded_data_buffer.reset();
 	decoded_data_buffer.resize(reader.reader.allocator, sizeof(bool) * valid_count);
20 changes: 12 additions & 8 deletions extension/parquet/geo_parquet.cpp
@@ -382,21 +382,26 @@ bool GeoParquetFileMetadata::IsGeoParquetConversionEnabled(const ClientContext &
 	return true;
 }
 
+LogicalType GeoParquetFileMetadata::GeometryType() {
+	auto blob_type = LogicalType(LogicalTypeId::BLOB);
+	blob_type.SetAlias("GEOMETRY");
+	return blob_type;
+}
+
 unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReader &reader,
-                                                                    const LogicalType &logical_type,
-                                                                    const SchemaElement &s_ele, idx_t schema_idx_p,
-                                                                    idx_t max_define_p, idx_t max_repeat_p,
+                                                                    const ParquetColumnSchema &schema,
                                                                     ClientContext &context) {
 
-	D_ASSERT(IsGeometryColumn(s_ele.name));
+	D_ASSERT(IsGeometryColumn(schema.name));
 
-	const auto &column = geometry_columns[s_ele.name];
+	const auto &column = geometry_columns[schema.name];
 
 	// Get the catalog
 	auto &catalog = Catalog::GetSystemCatalog(context);
 
 	// WKB encoding
-	if (logical_type.id() == LogicalTypeId::BLOB && column.geometry_encoding == GeoParquetColumnEncoding::WKB) {
+	if (schema.children[0].type.id() == LogicalTypeId::BLOB &&
+	    column.geometry_encoding == GeoParquetColumnEncoding::WKB) {
 		// Look for a conversion function in the catalog
 		auto &conversion_func_set =
 		    catalog.GetEntry(context, CatalogType::SCALAR_FUNCTION_ENTRY, DEFAULT_SCHEMA, "st_geomfromwkb")
@@ -410,8 +415,7 @@ unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReade
 	    make_uniq<BoundFunctionExpression>(conversion_func.return_type, conversion_func, std::move(args), nullptr);
 
 	// Create a child reader
-	auto child_reader =
-	    ColumnReader::CreateReader(reader, logical_type, s_ele, schema_idx_p, max_define_p, max_repeat_p);
+	auto child_reader = ColumnReader::CreateReader(reader, schema.children[0]);
 
 	// Create an expression reader that applies the conversion function to the child reader
 	return make_uniq<ExpressionColumnReader>(context, std::move(child_reader), std::move(expr));
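
The new GeometryType() helper centralizes construction of the logical type for GeoParquet geometry columns: a BLOB whose type alias is "GEOMETRY". A minimal usage sketch, assuming DuckDB's LogicalType::GetAlias() accessor (the diff itself only shows SetAlias):

// Callers ask the metadata class for the geometry type instead of
// hand-building an aliased BLOB at every site.
auto geom_type = GeoParquetFileMetadata::GeometryType();
D_ASSERT(geom_type.id() == LogicalTypeId::BLOB);
D_ASSERT(geom_type.GetAlias() == "GEOMETRY");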
59 changes: 29 additions & 30 deletions extension/parquet/include/column_reader.hpp
@@ -20,6 +20,7 @@
 #include "decoder/rle_decoder.hpp"
 #include "decoder/delta_length_byte_array_decoder.hpp"
 #include "decoder/delta_byte_array_decoder.hpp"
+#include "parquet_column_schema.hpp"
 #ifndef DUCKDB_AMALGAMATION
 
 #include "duckdb/common/operator/cast_operators.hpp"
@@ -60,14 +61,11 @@ class ColumnReader {
 	friend class RLEDecoder;
 
 public:
-	ColumnReader(ParquetReader &reader, LogicalType type_p, const SchemaElement &schema_p, idx_t file_idx_p,
-	             idx_t max_define_p, idx_t max_repeat_p);
+	ColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema_p);
 	virtual ~ColumnReader();
 
 public:
-	static unique_ptr<ColumnReader> CreateReader(ParquetReader &reader, const LogicalType &type_p,
-	                                             const SchemaElement &schema_p, idx_t schema_idx_p, idx_t max_define,
-	                                             idx_t max_repeat);
+	static unique_ptr<ColumnReader> CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema);
 	virtual void InitializeRead(idx_t row_group_index, const vector<ColumnChunk> &columns, TProtocol &protocol_p);
 	virtual idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out);
 	virtual void Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
@@ -78,14 +76,22 @@ class ColumnReader {
 	virtual void Skip(idx_t num_values);
 
 	ParquetReader &Reader();
-	const LogicalType &Type() const;
-	const SchemaElement &Schema() const;
-	optional_ptr<const SchemaElement> GetParentSchema() const;
-	void SetParentSchema(const SchemaElement &parent_schema);
+	const LogicalType &Type() const {
+		return column_schema.type;
+	}
+	const ParquetColumnSchema &Schema() const {
+		return column_schema;
+	}
 
-	idx_t FileIdx() const;
-	idx_t MaxDefine() const;
-	idx_t MaxRepeat() const;
+	inline idx_t ColumnIndex() const {
+		return column_schema.column_index;
+	}
+	inline idx_t MaxDefine() const {
+		return column_schema.max_define;
+	}
+	idx_t MaxRepeat() const {
+		return column_schema.max_repeat;
+	}
 
 	virtual idx_t FileOffset() const;
 	virtual uint64_t TotalCompressedSize();
@@ -94,7 +100,7 @@
 	// register the range this reader will touch for prefetching
 	virtual void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge);
 
-	virtual unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const vector<ColumnChunk> &columns);
+	unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const vector<ColumnChunk> &columns);
 
 	template <class VALUE_TYPE, class CONVERSION, bool HAS_DEFINES>
 	void PlainTemplatedDefines(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
@@ -141,7 +147,7 @@
 		}
 		idx_t valid_count = 0;
 		for (idx_t i = offset; i < offset + count; i++) {
-			valid_count += defines[i] == max_define;
+			valid_count += defines[i] == MaxDefine();
 		}
 		return valid_count;
 	}
@@ -177,7 +183,7 @@
 		}
 		auto &result_mask = FlatVector::Validity(result);
 		for (idx_t row_idx = result_offset; row_idx < result_offset + num_values; row_idx++) {
-			if (HAS_DEFINES && defines[row_idx] != max_define) {
+			if (HAS_DEFINES && defines[row_idx] != MaxDefine()) {
 				result_mask.SetInvalid(row_idx);
 				continue;
 			}
@@ -193,7 +199,7 @@
 			return;
 		}
 		for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
-			if (HAS_DEFINES && defines[row_idx] != max_define) {
+			if (HAS_DEFINES && defines[row_idx] != MaxDefine()) {
 				continue;
 			}
 			CONVERSION::template PlainSkip<CHECKED>(plain_data, *this);
@@ -211,25 +217,18 @@
 	// applies any skips that were registered using Skip()
 	virtual void ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out);
 
-	bool HasDefines() const {
-		return max_define > 0;
+	inline bool HasDefines() const {
+		return MaxDefine() > 0;
 	}
 
-	bool HasRepeats() const {
-		return max_repeat > 0;
+	inline bool HasRepeats() const {
+		return MaxRepeat() > 0;
 	}
 
 protected:
-	const SchemaElement &schema;
-	optional_ptr<const SchemaElement> parent_schema;
-
-	idx_t file_idx;
-	idx_t max_define;
-	idx_t max_repeat;
+	const ParquetColumnSchema &column_schema;
 
 	ParquetReader &reader;
-	LogicalType type;
 
 	idx_t pending_skips = 0;
 	bool page_is_filtered_out = false;
@@ -271,15 +270,15 @@
 public:
 	template <class TARGET>
 	TARGET &Cast() {
-		if (TARGET::TYPE != PhysicalType::INVALID && type.InternalType() != TARGET::TYPE) {
+		if (TARGET::TYPE != PhysicalType::INVALID && Type().InternalType() != TARGET::TYPE) {
 			throw InternalException("Failed to cast column reader to type - type mismatch");
 		}
 		return reinterpret_cast<TARGET &>(*this);
 	}
 
 	template <class TARGET>
 	const TARGET &Cast() const {
-		if (TARGET::TYPE != PhysicalType::INVALID && type.InternalType() != TARGET::TYPE) {
+		if (TARGET::TYPE != PhysicalType::INVALID && Type().InternalType() != TARGET::TYPE) {
 			throw InternalException("Failed to cast column reader to type - type mismatch");
 		}
 		return reinterpret_cast<const TARGET &>(*this);
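
The header now leans entirely on ParquetColumnSchema from parquet_column_schema.hpp, which is not part of the rendered diff. Below is a minimal, self-contained sketch of the shape that struct plausibly has, reconstructed purely from the fields this PR uses; member names beyond those used in the diff, all defaults, and the stand-in types are assumptions:

#include <cstdint>
#include <string>
#include <vector>

// Stand-ins for DuckDB / parquet-thrift types, just so the sketch compiles.
struct LogicalType { int internal_id = 0; };
enum class ParquetPhysicalType { BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY };
using idx_t = uint64_t;

// Fields inferred from their uses in this diff:
//   type          -> reader.Type()                (column_reader.hpp accessor)
//   parquet_type  -> reader.Schema().parquet_type (decoder switch statements)
//   name          -> dictionary_id / geometry-column lookup
//   max_define, max_repeat, column_index -> accessor bodies in column_reader.hpp
//   children      -> schema.children[0] in geo_parquet.cpp
struct ParquetColumnSchema {
	std::string name;
	LogicalType type;                          // mapped DuckDB logical type
	ParquetPhysicalType parquet_type;          // physical Parquet type
	idx_t max_define = 0;                      // max definition level
	idx_t max_repeat = 0;                      // max repetition level
	idx_t column_index = 0;                    // leaf column index in the file
	std::vector<ParquetColumnSchema> children; // nested children (structs, lists, geometry wrappers)
};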
7 changes: 4 additions & 3 deletions extension/parquet/include/geo_parquet.hpp
@@ -16,6 +16,7 @@
 #include "parquet_types.h"
 
 namespace duckdb {
+struct ParquetColumnSchema;
 
 enum class WKBGeometryType : uint16_t {
 	POINT = 1,
@@ -127,14 +128,14 @@ class GeoParquetFileMetadata {
 	void FlushColumnMeta(const string &column_name, const GeoParquetColumnMetadata &meta);
 	const unordered_map<string, GeoParquetColumnMetadata> &GetColumnMeta() const;
 
-	unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const LogicalType &logical_type,
-	                                            const duckdb_parquet::SchemaElement &s_ele, idx_t schema_idx_p,
-	                                            idx_t max_define_p, idx_t max_repeat_p, ClientContext &context);
+	unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
+	                                            ClientContext &context);
 
 	bool IsGeometryColumn(const string &column_name) const;
 	void RegisterGeometryColumn(const string &column_name);
 
 	static bool IsGeoParquetConversionEnabled(const ClientContext &context);
+	static LogicalType GeometryType();
 
 private:
 	mutex write_lock;