Merged
287 changes: 87 additions & 200 deletions extension/parquet/column_reader.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions extension/parquet/decoder/byte_stream_split_decoder.cpp
@@ -21,7 +21,7 @@ void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &re
 
 	auto &allocator = reader.reader.allocator;
 	decoded_data_buffer.reset();
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::FLOAT:
 		decoded_data_buffer.resize(allocator, sizeof(float) * valid_count);
 		bss_decoder->GetBatch<float>(decoded_data_buffer.ptr, valid_count);
@@ -39,7 +39,7 @@ void ByteStreamSplitDecoder::Read(uint8_t *defines, idx_t read_count, Vector &re
 
 void ByteStreamSplitDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 	idx_t valid_count = reader.GetValidCount(defines, skip_count);
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::FLOAT:
 		bss_decoder->Skip<float>(valid_count);
 		break;
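
Every decoder file below repeats this same substitution, so it is worth seeing the shape of the change once in isolation. A self-contained toy sketch (the accessor names mirror the PR; the types are stand-ins, not DuckDB's) showing direct member access replaced by schema-backed accessors:

#include <cstdint>
#include <iostream>

enum class ParquetType { FLOAT, DOUBLE };

// Stand-in for the consolidated per-column metadata introduced by the PR.
struct ParquetColumnSchema {
	ParquetType parquet_type;
	uint64_t max_define;
};

class ColumnReader {
public:
	explicit ColumnReader(const ParquetColumnSchema &schema) : column_schema(schema) {}
	// Accessors replace the old public members (schema, type, max_define, ...).
	const ParquetColumnSchema &Schema() const { return column_schema; }
	uint64_t MaxDefine() const { return column_schema.max_define; }

private:
	const ParquetColumnSchema &column_schema; // single source of truth
};

int main() {
	ParquetColumnSchema schema{ParquetType::FLOAT, /*max_define=*/1};
	ColumnReader reader(schema);
	// Old call sites read reader.schema.type and reader.max_define;
	// new call sites, as in the hunks above and below:
	std::cout << (reader.Schema().parquet_type == ParquetType::FLOAT) << " "
	          << reader.MaxDefine() << "\n";
	return 0;
}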
6 changes: 2 additions & 4 deletions extension/parquet/decoder/delta_binary_packed_decoder.cpp
@@ -19,11 +19,10 @@ void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &
 
 	auto &allocator = reader.reader.allocator;
 	decoded_data_buffer.reset();
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::INT32:
 		decoded_data_buffer.resize(allocator, sizeof(int32_t) * (valid_count));
 		dbp_decoder->GetBatch<int32_t>(decoded_data_buffer.ptr, valid_count);
-
 		break;
 	case duckdb_parquet::Type::INT64:
 		decoded_data_buffer.resize(allocator, sizeof(int64_t) * (valid_count));
@@ -39,10 +38,9 @@ void DeltaBinaryPackedDecoder::Read(uint8_t *defines, idx_t read_count, Vector &
 
 void DeltaBinaryPackedDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 	idx_t valid_count = reader.GetValidCount(defines, skip_count);
-	switch (reader.schema.type) {
+	switch (reader.Schema().parquet_type) {
 	case duckdb_parquet::Type::INT32:
 		dbp_decoder->Skip<int32_t>(valid_count);
-
 		break;
 	case duckdb_parquet::Type::INT64:
 		dbp_decoder->Skip<int64_t>(valid_count);
6 changes: 3 additions & 3 deletions extension/parquet/decoder/delta_byte_array_decoder.cpp
@@ -20,7 +20,7 @@ void DeltaByteArrayDecoder::ReadDbpData(Allocator &allocator, ResizeableBuffer &
 }
 
 void DeltaByteArrayDecoder::InitializePage() {
-	if (reader.type.InternalType() != PhysicalType::VARCHAR) {
+	if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
 		throw std::runtime_error("Delta Byte Array encoding is only supported for string/blob data");
 	}
 	auto &block = *reader.block;
@@ -69,7 +69,7 @@ void DeltaByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vector &res
 	auto &result_mask = FlatVector::Validity(result);
 	auto string_data = FlatVector::GetData<string_t>(*byte_array_data);
 	for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
-		if (defines && defines[row_idx + result_offset] != reader.max_define) {
+		if (defines && defines[row_idx + result_offset] != reader.MaxDefine()) {
 			result_mask.SetInvalid(row_idx + result_offset);
 			continue;
 		}
@@ -88,7 +88,7 @@ void DeltaByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 		throw std::runtime_error("Internal error - DeltaByteArray called but there was no byte_array_data set");
 	}
 	for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
-		if (defines && defines[row_idx] != reader.max_define) {
+		if (defines && defines[row_idx] != reader.MaxDefine()) {
 			continue;
 		}
 		if (delta_offset >= byte_array_count) {
6 changes: 3 additions & 3 deletions extension/parquet/decoder/delta_length_byte_array_decoder.cpp
@@ -11,7 +11,7 @@ DeltaLengthByteArrayDecoder::DeltaLengthByteArrayDecoder(ColumnReader &reader)
 }
 
 void DeltaLengthByteArrayDecoder::InitializePage() {
-	if (reader.type.InternalType() != PhysicalType::VARCHAR) {
+	if (reader.Type().InternalType() != PhysicalType::VARCHAR) {
 		throw std::runtime_error("Delta Length Byte Array encoding is only supported for string/blob data");
 	}
 	// read the binary packed lengths
@@ -28,7 +28,7 @@ void DeltaLengthByteArrayDecoder::Read(uint8_t *defines, idx_t read_count, Vecto
 	auto &result_mask = FlatVector::Validity(result);
 	for (idx_t row_idx = 0; row_idx < read_count; row_idx++) {
 		auto result_idx = result_offset + row_idx;
-		if (defines && defines[result_idx] != reader.max_define) {
+		if (defines && defines[result_idx] != reader.MaxDefine()) {
 			result_mask.SetInvalid(result_idx);
 			continue;
 		}
@@ -52,7 +52,7 @@ void DeltaLengthByteArrayDecoder::Skip(uint8_t *defines, idx_t skip_count) {
 	auto &block = *reader.block;
 	auto length_data = reinterpret_cast<uint32_t *>(length_buffer.ptr);
 	for (idx_t row_idx = 0; row_idx < skip_count; row_idx++) {
-		if (defines && defines[row_idx] != reader.max_define) {
+		if (defines && defines[row_idx] != reader.MaxDefine()) {
 			continue;
 		}
 		if (length_idx >= byte_array_count) {
7 changes: 4 additions & 3 deletions extension/parquet/decoder/dictionary_decoder.cpp
@@ -19,11 +19,12 @@ void DictionaryDecoder::InitializeDictionary(idx_t new_dictionary_size, optional
 	filter_count = 0;
 	// we use the first value in the dictionary to keep a NULL
 	if (!dictionary) {
-		dictionary = make_uniq<Vector>(reader.type, dictionary_size + 1);
+		dictionary = make_uniq<Vector>(reader.Type(), dictionary_size + 1);
 	} else if (dictionary_size > old_dict_size) {
 		dictionary->Resize(old_dict_size, dictionary_size + 1);
 	}
-	dictionary_id = reader.reader.file_name + "_" + reader.schema.name + "_" + std::to_string(reader.chunk_read_offset);
+	dictionary_id =
+	    reader.reader.file_name + "_" + reader.Schema().name + "_" + std::to_string(reader.chunk_read_offset);
 	// we use the last entry as a NULL, dictionary vectors don't have a separate validity mask
 	auto &dict_validity = FlatVector::Validity(*dictionary);
 	dict_validity.Reset(dictionary_size + 1);
@@ -77,7 +78,7 @@ idx_t DictionaryDecoder::GetValidValues(uint8_t *defines, idx_t read_count, idx_
 		for (idx_t i = 0; i < read_count; i++) {
 			valid_sel.set_index(valid_count, i);
 			dictionary_selection_vector.set_index(i, dictionary_size);
-			valid_count += defines[result_offset + i] == reader.max_define;
+			valid_count += defines[result_offset + i] == reader.MaxDefine();
 		}
 	}
 	return valid_count;
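
The dictionary_id assembled above keys a cached dictionary vector to one file, one column, and one chunk. A quick illustration with hypothetical values (none of these appear in the PR):

// file_name         = "data/part-0.parquet"
// Schema().name     = "category"
// chunk_read_offset = 4096
// dictionary_id     = "data/part-0.parquet_category_4096"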
4 changes: 2 additions & 2 deletions extension/parquet/decoder/rle_decoder.cpp
@@ -9,7 +9,7 @@ RLEDecoder::RLEDecoder(ColumnReader &reader) : reader(reader), decoded_data_buff
 }
 
 void RLEDecoder::InitializePage() {
-	if (reader.type.id() != LogicalTypeId::BOOLEAN) {
+	if (reader.Type().id() != LogicalTypeId::BOOLEAN) {
 		throw std::runtime_error("RLE encoding is only supported for boolean data");
 	}
 	auto &block = reader.block;
@@ -19,7 +19,7 @@ void RLEDecoder::InitializePage() {
 
 void RLEDecoder::Read(uint8_t *defines, idx_t read_count, Vector &result, idx_t result_offset) {
 	// RLE encoding for boolean
-	D_ASSERT(reader.type.id() == LogicalTypeId::BOOLEAN);
+	D_ASSERT(reader.Type().id() == LogicalTypeId::BOOLEAN);
 	idx_t valid_count = reader.GetValidCount(defines, read_count, result_offset);
 	decoded_data_buffer.reset();
 	decoded_data_buffer.resize(reader.reader.allocator, sizeof(bool) * valid_count);
20 changes: 12 additions & 8 deletions extension/parquet/geo_parquet.cpp
@@ -382,21 +382,26 @@ bool GeoParquetFileMetadata::IsGeoParquetConversionEnabled(const ClientContext &
 	return true;
 }
 
+LogicalType GeoParquetFileMetadata::GeometryType() {
+	auto blob_type = LogicalType(LogicalTypeId::BLOB);
+	blob_type.SetAlias("GEOMETRY");
+	return blob_type;
+}
+
 unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReader &reader,
-                                                                    const LogicalType &logical_type,
-                                                                    const SchemaElement &s_ele, idx_t schema_idx_p,
-                                                                    idx_t max_define_p, idx_t max_repeat_p,
+                                                                    const ParquetColumnSchema &schema,
                                                                     ClientContext &context) {
 
-	D_ASSERT(IsGeometryColumn(s_ele.name));
+	D_ASSERT(IsGeometryColumn(schema.name));
 
-	const auto &column = geometry_columns[s_ele.name];
+	const auto &column = geometry_columns[schema.name];
 
 	// Get the catalog
 	auto &catalog = Catalog::GetSystemCatalog(context);
 
 	// WKB encoding
-	if (logical_type.id() == LogicalTypeId::BLOB && column.geometry_encoding == GeoParquetColumnEncoding::WKB) {
+	if (schema.children[0].type.id() == LogicalTypeId::BLOB &&
+	    column.geometry_encoding == GeoParquetColumnEncoding::WKB) {
 		// Look for a conversion function in the catalog
 		auto &conversion_func_set =
 		    catalog.GetEntry(context, CatalogType::SCALAR_FUNCTION_ENTRY, DEFAULT_SCHEMA, "st_geomfromwkb")
@@ -410,8 +415,7 @@ unique_ptr<ColumnReader> GeoParquetFileMetadata::CreateColumnReader(ParquetReade
 	    make_uniq<BoundFunctionExpression>(conversion_func.return_type, conversion_func, std::move(args), nullptr);
 
 	// Create a child reader
-	auto child_reader =
-	    ColumnReader::CreateReader(reader, logical_type, s_ele, schema_idx_p, max_define_p, max_repeat_p);
+	auto child_reader = ColumnReader::CreateReader(reader, schema.children[0]);
 
 	// Create an expression reader that applies the conversion function to the child reader
 	return make_uniq<ExpressionColumnReader>(context, std::move(child_reader), std::move(expr));
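
The new GeometryType() helper centralizes construction of the logical type for GeoParquet geometry columns: a BLOB whose type alias is "GEOMETRY". A minimal usage sketch, assuming DuckDB's LogicalType::GetAlias() accessor (the diff itself only shows SetAlias):

// Callers ask the metadata class for the geometry type instead of
// hand-building an aliased BLOB at every site.
auto geom_type = GeoParquetFileMetadata::GeometryType();
D_ASSERT(geom_type.id() == LogicalTypeId::BLOB);
D_ASSERT(geom_type.GetAlias() == "GEOMETRY");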
59 changes: 29 additions & 30 deletions extension/parquet/include/column_reader.hpp
@@ -20,6 +20,7 @@
 #include "decoder/rle_decoder.hpp"
 #include "decoder/delta_length_byte_array_decoder.hpp"
 #include "decoder/delta_byte_array_decoder.hpp"
+#include "parquet_column_schema.hpp"
 #ifndef DUCKDB_AMALGAMATION
 
 #include "duckdb/common/operator/cast_operators.hpp"
@@ -60,14 +61,11 @@ class ColumnReader {
 	friend class RLEDecoder;
 
 public:
-	ColumnReader(ParquetReader &reader, LogicalType type_p, const SchemaElement &schema_p, idx_t file_idx_p,
-	             idx_t max_define_p, idx_t max_repeat_p);
+	ColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema_p);
 	virtual ~ColumnReader();
 
 public:
-	static unique_ptr<ColumnReader> CreateReader(ParquetReader &reader, const LogicalType &type_p,
-	                                             const SchemaElement &schema_p, idx_t schema_idx_p, idx_t max_define,
-	                                             idx_t max_repeat);
+	static unique_ptr<ColumnReader> CreateReader(ParquetReader &reader, const ParquetColumnSchema &schema);
 	virtual void InitializeRead(idx_t row_group_index, const vector<ColumnChunk> &columns, TProtocol &protocol_p);
 	virtual idx_t Read(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out);
 	virtual void Filter(uint64_t num_values, data_ptr_t define_out, data_ptr_t repeat_out, Vector &result_out,
@@ -78,14 +76,22 @@ class ColumnReader {
 	virtual void Skip(idx_t num_values);
 
 	ParquetReader &Reader();
-	const LogicalType &Type() const;
-	const SchemaElement &Schema() const;
-	optional_ptr<const SchemaElement> GetParentSchema() const;
-	void SetParentSchema(const SchemaElement &parent_schema);
+	const LogicalType &Type() const {
+		return column_schema.type;
+	}
+	const ParquetColumnSchema &Schema() const {
+		return column_schema;
+	}
 
-	idx_t FileIdx() const;
-	idx_t MaxDefine() const;
-	idx_t MaxRepeat() const;
+	inline idx_t ColumnIndex() const {
+		return column_schema.column_index;
+	}
+	inline idx_t MaxDefine() const {
+		return column_schema.max_define;
+	}
+	idx_t MaxRepeat() const {
+		return column_schema.max_repeat;
+	}
 
 	virtual idx_t FileOffset() const;
 	virtual uint64_t TotalCompressedSize();
@@ -94,7 +100,7 @@
 	// register the range this reader will touch for prefetching
 	virtual void RegisterPrefetch(ThriftFileTransport &transport, bool allow_merge);
 
-	virtual unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const vector<ColumnChunk> &columns);
+	unique_ptr<BaseStatistics> Stats(idx_t row_group_idx_p, const vector<ColumnChunk> &columns);
 
 	template <class VALUE_TYPE, class CONVERSION, bool HAS_DEFINES>
 	void PlainTemplatedDefines(ByteBuffer &plain_data, uint8_t *defines, uint64_t num_values, idx_t result_offset,
@@ -141,7 +147,7 @@
 		}
 		idx_t valid_count = 0;
 		for (idx_t i = offset; i < offset + count; i++) {
-			valid_count += defines[i] == max_define;
+			valid_count += defines[i] == MaxDefine();
 		}
 		return valid_count;
 	}
@@ -177,7 +183,7 @@
 		}
 		auto &result_mask = FlatVector::Validity(result);
 		for (idx_t row_idx = result_offset; row_idx < result_offset + num_values; row_idx++) {
-			if (HAS_DEFINES && defines[row_idx] != max_define) {
+			if (HAS_DEFINES && defines[row_idx] != MaxDefine()) {
 				result_mask.SetInvalid(row_idx);
 				continue;
 			}
@@ -193,7 +199,7 @@
 			return;
 		}
 		for (idx_t row_idx = 0; row_idx < num_values; row_idx++) {
-			if (HAS_DEFINES && defines[row_idx] != max_define) {
+			if (HAS_DEFINES && defines[row_idx] != MaxDefine()) {
 				continue;
 			}
 			CONVERSION::template PlainSkip<CHECKED>(plain_data, *this);
@@ -211,25 +217,18 @@
 	// applies any skips that were registered using Skip()
 	virtual void ApplyPendingSkips(data_ptr_t define_out, data_ptr_t repeat_out);
 
-	bool HasDefines() const {
-		return max_define > 0;
+	inline bool HasDefines() const {
+		return MaxDefine() > 0;
 	}
 
-	bool HasRepeats() const {
-		return max_repeat > 0;
+	inline bool HasRepeats() const {
+		return MaxRepeat() > 0;
 	}
 
 protected:
-	const SchemaElement &schema;
-	optional_ptr<const SchemaElement> parent_schema;
-
-	idx_t file_idx;
-	idx_t max_define;
-	idx_t max_repeat;
+	const ParquetColumnSchema &column_schema;
 
 	ParquetReader &reader;
-	LogicalType type;
 
 	idx_t pending_skips = 0;
 	bool page_is_filtered_out = false;
@@ -271,15 +270,15 @@
 public:
 	template <class TARGET>
 	TARGET &Cast() {
-		if (TARGET::TYPE != PhysicalType::INVALID && type.InternalType() != TARGET::TYPE) {
+		if (TARGET::TYPE != PhysicalType::INVALID && Type().InternalType() != TARGET::TYPE) {
 			throw InternalException("Failed to cast column reader to type - type mismatch");
 		}
 		return reinterpret_cast<TARGET &>(*this);
 	}
 
 	template <class TARGET>
 	const TARGET &Cast() const {
-		if (TARGET::TYPE != PhysicalType::INVALID && type.InternalType() != TARGET::TYPE) {
+		if (TARGET::TYPE != PhysicalType::INVALID && Type().InternalType() != TARGET::TYPE) {
 			throw InternalException("Failed to cast column reader to type - type mismatch");
 		}
 		return reinterpret_cast<const TARGET &>(*this);
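
The header now leans entirely on ParquetColumnSchema from parquet_column_schema.hpp, which is not part of the rendered diff. Below is a minimal, self-contained sketch of the shape that struct plausibly has, reconstructed purely from the fields this PR uses; member names beyond those used in the diff, all defaults, and the stand-in types are assumptions:

#include <cstdint>
#include <string>
#include <vector>

// Stand-ins for DuckDB / parquet-thrift types, just so the sketch compiles.
struct LogicalType { int internal_id = 0; };
enum class ParquetPhysicalType { BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BYTE_ARRAY };
using idx_t = uint64_t;

// Fields inferred from their uses in this diff:
//   type          -> reader.Type()                (column_reader.hpp accessor)
//   parquet_type  -> reader.Schema().parquet_type (decoder switch statements)
//   name          -> dictionary_id / geometry-column lookup
//   max_define, max_repeat, column_index -> accessor bodies in column_reader.hpp
//   children      -> schema.children[0] in geo_parquet.cpp
struct ParquetColumnSchema {
	std::string name;
	LogicalType type;                          // mapped DuckDB logical type
	ParquetPhysicalType parquet_type;          // physical Parquet type
	idx_t max_define = 0;                      // max definition level
	idx_t max_repeat = 0;                      // max repetition level
	idx_t column_index = 0;                    // leaf column index in the file
	std::vector<ParquetColumnSchema> children; // nested children (structs, lists, geometry wrappers)
};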
7 changes: 4 additions & 3 deletions extension/parquet/include/geo_parquet.hpp
@@ -16,6 +16,7 @@
 #include "parquet_types.h"
 
 namespace duckdb {
+struct ParquetColumnSchema;
 
 enum class WKBGeometryType : uint16_t {
 	POINT = 1,
@@ -127,14 +128,14 @@ class GeoParquetFileMetadata {
 	void FlushColumnMeta(const string &column_name, const GeoParquetColumnMetadata &meta);
 	const unordered_map<string, GeoParquetColumnMetadata> &GetColumnMeta() const;
 
-	unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const LogicalType &logical_type,
-	                                            const duckdb_parquet::SchemaElement &s_ele, idx_t schema_idx_p,
-	                                            idx_t max_define_p, idx_t max_repeat_p, ClientContext &context);
+	unique_ptr<ColumnReader> CreateColumnReader(ParquetReader &reader, const ParquetColumnSchema &schema,
+	                                            ClientContext &context);
 
 	bool IsGeometryColumn(const string &column_name) const;
 	void RegisterGeometryColumn(const string &column_name);
 
 	static bool IsGeoParquetConversionEnabled(const ClientContext &context);
+	static LogicalType GeometryType();
 
 private:
 	mutex write_lock;