Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ed10956

Browse files
committed
fixup! Enforce writeStatsAsJson and writeStatsAsStruct option in Delta Lake
1 parent c36e823 commit ed10956

File tree

9 files changed

+558
-60
lines changed

9 files changed

+558
-60
lines changed

plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/DeltaLakeParquetStatisticsUtils.java

+132
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,16 @@
1414
package io.trino.plugin.deltalake.transactionlog;
1515

1616
import io.airlift.log.Logger;
17+
import io.airlift.slice.Slice;
1718
import io.trino.plugin.base.type.DecodedTimestamp;
19+
import io.trino.spi.block.BlockBuilder;
20+
import io.trino.spi.block.RowBlockBuilder;
1821
import io.trino.spi.type.DateType;
1922
import io.trino.spi.type.DecimalType;
23+
import io.trino.spi.type.Decimals;
24+
import io.trino.spi.type.Int128;
25+
import io.trino.spi.type.RowType;
26+
import io.trino.spi.type.TimestampType;
2027
import io.trino.spi.type.TimestampWithTimeZoneType;
2128
import io.trino.spi.type.Type;
2229
import io.trino.spi.type.VarcharType;
@@ -29,12 +36,16 @@
2936
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
3037
import org.apache.parquet.schema.LogicalTypeAnnotation;
3138

39+
import javax.annotation.Nullable;
40+
3241
import java.math.BigDecimal;
3342
import java.math.BigInteger;
3443
import java.time.Instant;
3544
import java.time.LocalDate;
3645
import java.time.ZonedDateTime;
3746
import java.util.Collection;
47+
import java.util.HashMap;
48+
import java.util.List;
3849
import java.util.Map;
3950
import java.util.Optional;
4051
import java.util.function.BiFunction;
@@ -44,11 +55,19 @@
4455
import static io.trino.parquet.ParquetTimestampUtils.decodeInt96Timestamp;
4556
import static io.trino.spi.type.BigintType.BIGINT;
4657
import static io.trino.spi.type.BooleanType.BOOLEAN;
58+
import static io.trino.spi.type.DateTimeEncoding.unpackMillisUtc;
59+
import static io.trino.spi.type.Decimals.isShortDecimal;
4760
import static io.trino.spi.type.DoubleType.DOUBLE;
4861
import static io.trino.spi.type.IntegerType.INTEGER;
4962
import static io.trino.spi.type.RealType.REAL;
5063
import static io.trino.spi.type.SmallintType.SMALLINT;
64+
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
65+
import static io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND;
5166
import static io.trino.spi.type.TinyintType.TINYINT;
67+
import static io.trino.spi.type.TypeUtils.writeNativeValue;
68+
import static java.lang.Float.floatToRawIntBits;
69+
import static java.lang.Float.intBitsToFloat;
70+
import static java.lang.Math.toIntExact;
5271
import static java.nio.charset.StandardCharsets.UTF_8;
5372
import static java.time.ZoneOffset.UTC;
5473
import static java.time.format.DateTimeFormatter.ISO_INSTANT;
@@ -71,6 +90,119 @@ public static boolean hasInvalidStatistics(Collection<ColumnChunkMetaData> metad
7190
(!metadata.getStatistics().hasNonNullValue() && metadata.getStatistics().getNumNulls() != metadata.getValueCount()));
7291
}
7392

93+
@Nullable
94+
public static Object toTrinoValue(Type type, @Nullable Object value)
95+
{
96+
if (value == null) {
97+
return null;
98+
}
99+
100+
if (type == SMALLINT || type == TINYINT || type == INTEGER) {
101+
return (long) (int) value;
102+
}
103+
if (type == BIGINT) {
104+
return (long) (int) value;
105+
}
106+
if (type == REAL) {
107+
return (long) floatToRawIntBits((float) (double) value);
108+
}
109+
if (type == DOUBLE) {
110+
return value;
111+
}
112+
if (type instanceof DecimalType) {
113+
BigDecimal decimal;
114+
checkArgument(value instanceof String || value instanceof Double, "Value must be instance of String or Double");
115+
if (value instanceof String) {
116+
decimal = new BigDecimal((String) value);
117+
}
118+
else {
119+
decimal = BigDecimal.valueOf((double) value);
120+
}
121+
122+
if (isShortDecimal(type)) {
123+
return Decimals.encodeShortScaledValue(decimal, ((DecimalType) type).getScale());
124+
}
125+
return Decimals.encodeScaledValue(decimal, ((DecimalType) type).getScale());
126+
}
127+
if (type instanceof VarcharType) {
128+
return value;
129+
}
130+
if (type.equals(DateType.DATE)) {
131+
return LocalDate.parse((String) value).toEpochDay();
132+
}
133+
if (type instanceof TimestampType) {
134+
return Instant.parse((String) value).toEpochMilli() * MICROSECONDS_PER_MILLISECOND;
135+
}
136+
if (type instanceof RowType) {
137+
RowType rowType = (RowType) type;
138+
Map<?, ?> values = (Map<?, ?>) value;
139+
List<Type> fieldTypes = rowType.getTypeParameters();
140+
BlockBuilder blockBuilder = new RowBlockBuilder(fieldTypes, null, 1);
141+
BlockBuilder singleRowBlockWriter = blockBuilder.beginBlockEntry();
142+
for (int i = 0; i < values.size(); ++i) {
143+
Type fieldType = fieldTypes.get(i);
144+
Object fieldValue = toTrinoValue(fieldType, values.get(rowType.getFields().get(i).getName().orElseThrow()));
145+
writeNativeValue(fieldType, singleRowBlockWriter, fieldValue);
146+
}
147+
148+
blockBuilder.closeEntry();
149+
return blockBuilder.build();
150+
}
151+
152+
throw new UnsupportedOperationException("Unsupported type: " + type);
153+
}
154+
155+
public static Optional<Map<String, Object>> toJsonValues(Map<String, Type> columnTypeMapping, Optional<Map<String, Object>> values)
156+
{
157+
if (values.isEmpty()) {
158+
return values;
159+
}
160+
161+
Map<String, Object> jsonValues = new HashMap<>();
162+
for (Map.Entry<String, Object> value : values.get().entrySet()) {
163+
jsonValues.put(value.getKey(), toJsonValue(columnTypeMapping.get(value.getKey()), value.getValue()));
164+
}
165+
return Optional.of(jsonValues);
166+
}
167+
168+
@Nullable
169+
private static Object toJsonValue(Type type, @Nullable Object value)
170+
{
171+
if (value == null) {
172+
return null;
173+
}
174+
175+
if (type == SMALLINT || type == TINYINT || type == INTEGER || type == BIGINT) {
176+
return value;
177+
}
178+
if (type == REAL) {
179+
return intBitsToFloat(toIntExact((long) value));
180+
}
181+
if (type == DOUBLE) {
182+
return value;
183+
}
184+
if (type instanceof DecimalType) {
185+
DecimalType decimalType = (DecimalType) type;
186+
if (decimalType.isShort()) {
187+
return Decimals.toString((long) value, decimalType.getScale());
188+
}
189+
return Decimals.toString((Int128) value, decimalType.getScale());
190+
}
191+
192+
if (type instanceof VarcharType) {
193+
return ((Slice) value).toStringUtf8();
194+
}
195+
if (type == DateType.DATE) {
196+
return LocalDate.ofEpochDay((long) value).format(ISO_LOCAL_DATE);
197+
}
198+
if (type == TIMESTAMP_TZ_MILLIS) {
199+
Instant ts = Instant.ofEpochMilli(unpackMillisUtc((long) value));
200+
return ISO_INSTANT.format(ZonedDateTime.ofInstant(ts, UTC));
201+
}
202+
203+
throw new UnsupportedOperationException("Unsupported type: " + type);
204+
}
205+
74206
public static Map<String, Object> jsonEncodeMin(Map<String, Optional<Statistics<?>>> stats, Map<String, Type> typeForColumn)
75207
{
76208
return jsonEncode(stats, typeForColumn, DeltaLakeParquetStatisticsUtils::getMin);

plugin/trino-delta-lake/src/main/java/io/trino/plugin/deltalake/transactionlog/checkpoint/CheckpointEntryIterator.java

+31-41
Original file line numberDiff line numberDiff line change
@@ -359,47 +359,37 @@ private DeltaLakeTransactionLogEntry buildAddEntry(ConnectorSession session, Blo
359359
Block addEntryBlock = block.getObject(pagePosition, Block.class);
360360
log.debug("Block %s has %s fields", block, addEntryBlock.getPositionCount());
361361

362-
Map<String, String> partitionValues = getMap(addEntryBlock, 1);
363-
long size = getLong(addEntryBlock, 2);
364-
long modificationTime = getLong(addEntryBlock, 3);
365-
boolean dataChange = getByte(addEntryBlock, 4) != 0;
366-
Map<String, String> tags = getMap(addEntryBlock, 7);
367-
368-
String path = getString(addEntryBlock, 0);
369-
AddFileEntry result;
370-
if (!addEntryBlock.isNull(6)) {
371-
result = new AddFileEntry(
372-
path,
373-
partitionValues,
374-
size,
375-
modificationTime,
376-
dataChange,
377-
Optional.empty(),
378-
Optional.of(parseStatisticsFromParquet(addEntryBlock.getObject(6, Block.class))),
379-
tags);
380-
}
381-
else if (!addEntryBlock.isNull(5)) {
382-
result = new AddFileEntry(
383-
path,
384-
partitionValues,
385-
size,
386-
modificationTime,
387-
dataChange,
388-
Optional.of(getString(addEntryBlock, 5)),
389-
Optional.empty(),
390-
tags);
391-
}
392-
else {
393-
result = new AddFileEntry(
394-
path,
395-
partitionValues,
396-
size,
397-
modificationTime,
398-
dataChange,
399-
Optional.empty(),
400-
Optional.empty(),
401-
tags);
402-
}
362+
int totalFieldCount = addEntryBlock.getPositionCount();
363+
int fieldId = 0;
364+
String path = getString(addEntryBlock, fieldId++);
365+
Map<String, String> partitionValues = getMap(addEntryBlock, fieldId++);
366+
long size = getLong(addEntryBlock, fieldId++);
367+
long modificationTime = getLong(addEntryBlock, fieldId++);
368+
boolean dataChange = getByte(addEntryBlock, fieldId++) != 0;
369+
Optional<String> stats = Optional.empty();
370+
if (!addEntryBlock.isNull(fieldId)) {
371+
stats = Optional.of(getString(addEntryBlock, fieldId++));
372+
}
373+
else if (totalFieldCount == 8) {
374+
fieldId++; // stats field exists, but it's empty
375+
}
376+
Optional<DeltaLakeParquetFileStatistics> parsedStats = Optional.empty();
377+
if (!addEntryBlock.isNull(fieldId)) {
378+
parsedStats = Optional.of(parseStatisticsFromParquet(addEntryBlock.getObject(fieldId++, Block.class)));
379+
}
380+
else if (totalFieldCount == 8) {
381+
fieldId++; // stats_parsed field exists, but it's empty
382+
}
383+
Map<String, String> tags = getMap(addEntryBlock, fieldId++);
384+
AddFileEntry result = new AddFileEntry(
385+
path,
386+
partitionValues,
387+
size,
388+
modificationTime,
389+
dataChange,
390+
stats,
391+
parsedStats,
392+
tags);
403393

404394
log.debug("Result: %s", result);
405395
return DeltaLakeTransactionLogEntry.addFileEntry(result);

0 commit comments

Comments
 (0)