Commit 955190f

ORC-124. Port of HIVE-15335 fast decimal to ORC. (omalley reviewed by Matt McCline)

Fixes apache#82

Signed-off-by: Owen O'Malley <[email protected]>
1 parent f15557c commit 955190f

File tree: 5 files changed, +77 -54 lines

- java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java (+27 -23)
- java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java (+5 -10)
- java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java (+23 -7)
- java/core/src/java/org/apache/orc/impl/WriterImpl.java (+15 -8)
- java/core/src/test/org/apache/orc/TestColumnStatistics.java (+7 -6)

java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java

Lines changed: 27 additions & 23 deletions
@@ -20,6 +20,7 @@
 import java.sql.Date;
 import java.sql.Timestamp;
 
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.serde2.io.DateWritable;
 import org.apache.hadoop.io.BytesWritable;
@@ -781,9 +782,11 @@ public int hashCode() {
 
   private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl
       implements DecimalColumnStatistics {
-    private HiveDecimal minimum = null;
-    private HiveDecimal maximum = null;
-    private HiveDecimal sum = HiveDecimal.ZERO;
+
+    // These objects are mutable for better performance.
+    private HiveDecimalWritable minimum = null;
+    private HiveDecimalWritable maximum = null;
+    private HiveDecimalWritable sum = new HiveDecimalWritable(0);
 
     DecimalStatisticsImpl() {
     }
@@ -792,13 +795,13 @@ private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl
       super(stats);
       OrcProto.DecimalStatistics dec = stats.getDecimalStatistics();
       if (dec.hasMaximum()) {
-        maximum = HiveDecimal.create(dec.getMaximum());
+        maximum = new HiveDecimalWritable(dec.getMaximum());
       }
       if (dec.hasMinimum()) {
-        minimum = HiveDecimal.create(dec.getMinimum());
+        minimum = new HiveDecimalWritable(dec.getMinimum());
       }
       if (dec.hasSum()) {
-        sum = HiveDecimal.create(dec.getSum());
+        sum = new HiveDecimalWritable(dec.getSum());
       } else {
         sum = null;
       }
@@ -809,21 +812,21 @@ public void reset() {
       super.reset();
       minimum = null;
       maximum = null;
-      sum = HiveDecimal.ZERO;
+      sum = new HiveDecimalWritable(0);
     }
 
     @Override
-    public void updateDecimal(HiveDecimal value) {
+    public void updateDecimal(HiveDecimalWritable value) {
       if (minimum == null) {
-        minimum = value;
-        maximum = value;
+        minimum = new HiveDecimalWritable(value);
+        maximum = new HiveDecimalWritable(value);
       } else if (minimum.compareTo(value) > 0) {
-        minimum = value;
+        minimum.set(value);
       } else if (maximum.compareTo(value) < 0) {
-        maximum = value;
+        maximum.set(value);
       }
       if (sum != null) {
-        sum = sum.add(value);
+        sum.mutateAdd(value);
       }
     }
 
@@ -832,20 +835,20 @@ public void merge(ColumnStatisticsImpl other) {
       if (other instanceof DecimalStatisticsImpl) {
         DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other;
         if (minimum == null) {
-          minimum = dec.minimum;
-          maximum = dec.maximum;
+          minimum = (dec.minimum != null ? new HiveDecimalWritable(dec.minimum) : null);
+          maximum = (dec.maximum != null ? new HiveDecimalWritable(dec.maximum) : null);
           sum = dec.sum;
         } else if (dec.minimum != null) {
           if (minimum.compareTo(dec.minimum) > 0) {
-            minimum = dec.minimum;
+            minimum.set(dec.minimum);
           }
           if (maximum.compareTo(dec.maximum) < 0) {
-            maximum = dec.maximum;
+            maximum.set(dec.maximum);
           }
           if (sum == null || dec.sum == null) {
             sum = null;
           } else {
-            sum = sum.add(dec.sum);
+            sum.mutateAdd(dec.sum);
           }
         }
       } else {
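The heart of the statistics change is in the updateDecimal() and merge() hunks above: minimum, maximum, and sum are now mutable HiveDecimalWritable objects updated in place via set() and mutateAdd(), instead of being replaced with freshly allocated immutable HiveDecimal values on every row. A minimal standalone sketch of the difference, not taken from this commit (the class name, loop bound, and values are illustrative):

// Sketch: immutable HiveDecimal accumulation vs. in-place HiveDecimalWritable accumulation.
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;

public class DecimalSumSketch {
  public static void main(String[] args) {
    // Old style: every add produces a new immutable HiveDecimal for the running sum.
    HiveDecimal oldSum = HiveDecimal.ZERO;
    for (int i = 1; i <= 1000; i++) {
      oldSum = oldSum.add(HiveDecimal.create(i));
    }

    // New style: the running-sum object is never replaced, only mutated.
    HiveDecimalWritable newSum = new HiveDecimalWritable(0);
    for (int i = 1; i <= 1000; i++) {
      newSum.mutateAdd(new HiveDecimalWritable(i));
    }

    // Both agree: 1 + 2 + ... + 1000 = 500500.
    System.out.println(oldSum + " == " + newSum.getHiveDecimal());
  }
}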
@@ -865,7 +868,8 @@ public OrcProto.ColumnStatistics.Builder serialize() {
         dec.setMinimum(minimum.toString());
         dec.setMaximum(maximum.toString());
       }
-      if (sum != null) {
+      // Check isSet for overflow.
+      if (sum != null && sum.isSet()) {
         dec.setSum(sum.toString());
       }
       result.setDecimalStatistics(dec);
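The strengthened guard in serialize() exists because mutateAdd() can overflow HiveDecimal's 38-digit precision; the new in-line comment indicates that an overflowed writable reports isSet() == false, and such a sum is simply omitted from the DecimalStatistics message instead of being serialized. A hedged illustration, assuming overflow does leave the writable unset (the 38-nines literal is chosen only to force that case):

import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;

class DecimalOverflowSketch {
  public static void main(String[] args) {
    HiveDecimalWritable sum =
        new HiveDecimalWritable("99999999999999999999999999999999999999"); // 38 digits, the maximum decimal precision
    sum.mutateAdd(new HiveDecimalWritable(1)); // the exact result would need 39 digits
    System.out.println(sum.isSet());           // expected false, so serialize() would skip setSum()
  }
}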
@@ -874,17 +878,17 @@ public OrcProto.ColumnStatistics.Builder serialize() {
 
     @Override
     public HiveDecimal getMinimum() {
-      return minimum;
+      return minimum.getHiveDecimal();
     }
 
     @Override
     public HiveDecimal getMaximum() {
-      return maximum;
+      return maximum.getHiveDecimal();
     }
 
     @Override
     public HiveDecimal getSum() {
-      return sum;
+      return sum.getHiveDecimal();
     }
 
     @Override
@@ -1303,7 +1307,7 @@ public void updateBinary(byte[] bytes, int offset, int length,
     throw new UnsupportedOperationException("Can't update string");
   }
 
-  public void updateDecimal(HiveDecimal value) {
+  public void updateDecimal(HiveDecimalWritable value) {
     throw new UnsupportedOperationException("Can't update decimal");
   }
 

java/core/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java

Lines changed: 5 additions & 10 deletions
@@ -818,7 +818,7 @@ public static class FloatFromDecimalTreeReader extends ConvertTreeReader {
     @Override
     public void setConvertVectorElement(int elementNum) throws IOException {
       doubleColVector.vector[elementNum] =
-          (float) decimalColVector.vector[elementNum].getHiveDecimal().doubleValue();
+          (float) decimalColVector.vector[elementNum].doubleValue();
     }
 
     @Override
@@ -1024,7 +1024,7 @@ public static class DoubleFromDecimalTreeReader extends ConvertTreeReader {
     @Override
     public void setConvertVectorElement(int elementNum) throws IOException {
       doubleColVector.vector[elementNum] =
-          decimalColVector.vector[elementNum].getHiveDecimal().doubleValue();
+          decimalColVector.vector[elementNum].doubleValue();
     }
 
     @Override
@@ -1361,14 +1361,8 @@ public static class DecimalFromDecimalTreeReader extends ConvertTreeReader {
     @Override
     public void setConvertVectorElement(int elementNum) throws IOException {
 
-      HiveDecimalWritable valueWritable = HiveDecimalWritable.enforcePrecisionScale(
-          fileDecimalColVector.vector[elementNum], readerPrecision, readerScale);
-      if (valueWritable != null) {
-        decimalColVector.set(elementNum, valueWritable);
-      } else {
-        decimalColVector.noNulls = false;
-        decimalColVector.isNull[elementNum] = true;
-      }
+      decimalColVector.set(elementNum, fileDecimalColVector.vector[elementNum]);
+
     }
 
     @Override
@@ -1530,6 +1524,7 @@ public static class StringGroupFromDecimalTreeReader extends ConvertTreeReader {
     private final TypeDescription readerType;
     private DecimalColumnVector decimalColVector;
     private BytesColumnVector bytesColVector;
+    private byte[] scratchBuffer;
 
     StringGroupFromDecimalTreeReader(int columnId, TypeDescription fileType,
         TypeDescription readerType, Context context) throws IOException {
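Two patterns change in these converted readers. FloatFromDecimalTreeReader and DoubleFromDecimalTreeReader call doubleValue() directly on the HiveDecimalWritable held in the column vector rather than materializing a HiveDecimal first, and DecimalFromDecimalTreeReader now passes the file-side writable straight to DecimalColumnVector.set(), presumably because that method applies the reader's precision and scale itself (an assumption; this diff alone does not show it). A small illustrative sketch of the double conversion (class and method names are made up):

import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;

class DecimalToDoubleSketch {
  // Old path: decimals.vector[elementNum].getHiveDecimal().doubleValue()
  // New path: the writable exposes doubleValue() directly, so no temporary HiveDecimal is built.
  static double element(DecimalColumnVector decimals, int elementNum) {
    return decimals.vector[elementNum].doubleValue();
  }
}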

java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java

Lines changed: 23 additions & 7 deletions
@@ -40,6 +40,7 @@
 import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.OrcProto;
 
@@ -1096,6 +1097,7 @@ public static class DecimalTreeReader extends TreeReader {
     protected InStream valueStream;
     protected IntegerReader scaleReader = null;
     private int[] scratchScaleVector;
+    private byte[] scratchBytes;
 
     DecimalTreeReader(int columnId, Context context) throws IOException {
       this(columnId, null, null, null, null, context);
@@ -1107,6 +1109,7 @@ protected DecimalTreeReader(int columnId, InStream present,
       super(columnId, present, context);
       this.scratchScaleVector = new int[VectorizedRowBatch.DEFAULT_SIZE];
       this.valueStream = valueStream;
+      this.scratchBytes = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_SERIALIZATION_UTILS_READ];
       if (scaleStream != null && encoding != null) {
         checkEncoding(encoding);
         this.scaleReader = createIntegerReader(encoding.getKind(), scaleStream, true, context);
@@ -1159,18 +1162,30 @@ public void nextVector(ColumnVector previousVector,
       // read the scales
       scaleReader.nextVector(result, scratchScaleVector, batchSize);
       // Read value entries based on isNull entries
+      // Use the fast ORC deserialization method that emulates SerializationUtils.readBigInteger
+      // provided by HiveDecimalWritable.
+      HiveDecimalWritable[] vector = result.vector;
+      HiveDecimalWritable decWritable;
       if (result.noNulls) {
         for (int r=0; r < batchSize; ++r) {
-          BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
-          HiveDecimal dec = HiveDecimal.create(bInt, scratchScaleVector[r]);
-          result.set(r, dec);
+          decWritable = vector[r];
+          if (!decWritable.serializationUtilsRead(
+              valueStream, scratchScaleVector[r],
+              scratchBytes)) {
+            result.isNull[r] = true;
+            result.noNulls = false;
+          }
         }
       } else if (!result.isRepeating || !result.isNull[0]) {
         for (int r=0; r < batchSize; ++r) {
           if (!result.isNull[r]) {
-            BigInteger bInt = SerializationUtils.readBigInteger(valueStream);
-            HiveDecimal dec = HiveDecimal.create(bInt, scratchScaleVector[r]);
-            result.set(r, dec);
+            decWritable = vector[r];
+            if (!decWritable.serializationUtilsRead(
+                valueStream, scratchScaleVector[r],
+                scratchBytes)) {
+              result.isNull[r] = true;
+              result.noNulls = false;
+            }
           }
         }
       }
@@ -1179,8 +1194,9 @@ public void nextVector(ColumnVector previousVector,
     @Override
    void skipRows(long items) throws IOException {
       items = countNonNulls(items);
+      HiveDecimalWritable scratchDecWritable = new HiveDecimalWritable();
       for (int i = 0; i < items; i++) {
-        SerializationUtils.readBigInteger(valueStream);
+        scratchDecWritable.serializationUtilsRead(valueStream, 0, scratchBytes);
       }
       scaleReader.skip(items);
     }
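Taken together, the reader now decodes each serialized decimal straight into the HiveDecimalWritable that the DecimalColumnVector already owns, reusing one scratch byte[], where it previously allocated a BigInteger and a HiveDecimal per value. A simplified, hedged sketch of that loop (a plain InputStream and a bare scale array stand in for ORC's InStream and scale reader; the class and method are illustrative):

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;

class FastDecimalReadSketch {
  // One scratch buffer per reader, reused for every value.
  private final byte[] scratchBytes =
      new byte[HiveDecimal.SCRATCH_BUFFER_LEN_SERIALIZATION_UTILS_READ];

  void readBatch(InputStream valueStream, int[] scales,
                 DecimalColumnVector result, int batchSize) throws IOException {
    for (int r = 0; r < batchSize; ++r) {
      // Decode directly into the writable the vector already holds.
      HiveDecimalWritable decWritable = result.vector[r];
      if (!decWritable.serializationUtilsRead(valueStream, scales[r], scratchBytes)) {
        // A false return means the value could not be represented; mark the row null.
        result.isNull[r] = true;
        result.noNulls = false;
      }
    }
  }
}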

java/core/src/java/org/apache/orc/impl/WriterImpl.java

Lines changed: 15 additions & 8 deletions
@@ -35,6 +35,7 @@
 import io.airlift.compress.lz4.Lz4Decompressor;
 import io.airlift.compress.lzo.LzoCompressor;
 import io.airlift.compress.lzo.LzoDecompressor;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.orc.BinaryColumnStatistics;
 import org.apache.orc.ColumnStatistics;
@@ -1980,6 +1981,11 @@ OrcProto.ColumnEncoding getEncoding() {
 
   private static class DecimalTreeWriter extends TreeWriter {
     private final PositionedOutputStream valueStream;
+
+    // These scratch buffers allow us to serialize decimals much faster.
+    private final long[] scratchLongs;
+    private final byte[] scratchBuffer;
+
     private final IntegerWriter scaleStream;
     private final boolean isDirectV2;
 
@@ -1990,6 +1996,8 @@ private static class DecimalTreeWriter extends TreeWriter {
       super(columnId, schema, writer, nullable);
       this.isDirectV2 = isNewWriteFormat(writer);
       valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA);
+      scratchLongs = new long[HiveDecimal.SCRATCH_LONGS_LEN];
+      scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
       this.scaleStream = createIntegerWriter(writer.createStream(id,
           OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer);
       recordPosition(rowIndexPosition);
@@ -2012,31 +2020,30 @@ void writeBatch(ColumnVector vector, int offset,
       DecimalColumnVector vec = (DecimalColumnVector) vector;
       if (vector.isRepeating) {
         if (vector.noNulls || !vector.isNull[0]) {
-          HiveDecimal value = vec.vector[0].getHiveDecimal();
+          HiveDecimalWritable value = vec.vector[0];
           indexStatistics.updateDecimal(value);
           if (createBloomFilter) {
-            String str = value.toString();
+            String str = value.toString(scratchBuffer);
            if (bloomFilter != null) {
              bloomFilter.addString(str);
            }
            bloomFilterUtf8.addString(str);
          }
          for(int i=0; i < length; ++i) {
-            SerializationUtils.writeBigInteger(valueStream,
-                value.unscaledValue());
+            value.serializationUtilsWrite(valueStream,
+                scratchLongs);
            scaleStream.write(value.scale());
          }
        }
      } else {
        for(int i=0; i < length; ++i) {
          if (vec.noNulls || !vec.isNull[i + offset]) {
-            HiveDecimal value = vec.vector[i + offset].getHiveDecimal();
-            SerializationUtils.writeBigInteger(valueStream,
-                value.unscaledValue());
+            HiveDecimalWritable value = vec.vector[i + offset];
+            value.serializationUtilsWrite(valueStream, scratchLongs);
            scaleStream.write(value.scale());
            indexStatistics.updateDecimal(value);
            if (createBloomFilter) {
-              String str = value.toString();
+              String str = value.toString(scratchBuffer);
              if (bloomFilter != null) {
                bloomFilter.addString(str);
              }
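The write side mirrors the reader: each HiveDecimalWritable serializes its unscaled value through serializationUtilsWrite() with a preallocated long[] (replacing SerializationUtils.writeBigInteger on value.unscaledValue()), and toString(scratchBuffer) reuses a byte[] when building the bloom-filter key. A stripped-down, hedged sketch of those two calls, with OutputStream standing in for ORC's PositionedOutputStream and illustrative method names:

import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;

class FastDecimalWriteSketch {
  // Allocated once per writer and reused for every value.
  private final long[] scratchLongs = new long[HiveDecimal.SCRATCH_LONGS_LEN];
  private final byte[] scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];

  void writeValue(OutputStream valueStream, HiveDecimalWritable value) throws IOException {
    // Serialize the unscaled value without materializing a BigInteger.
    value.serializationUtilsWrite(valueStream, scratchLongs);
    // The real writer also writes value.scale() to the SECONDARY stream; omitted here.
  }

  String bloomFilterKey(HiveDecimalWritable value) {
    // Formats into the reusable buffer instead of allocating a fresh byte[] per value.
    return value.toString(scratchBuffer);
  }
}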

java/core/src/test/org/apache/orc/TestColumnStatistics.java

Lines changed: 7 additions & 6 deletions
@@ -30,6 +30,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
 import org.apache.hadoop.hive.common.type.HiveDecimal;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
@@ -168,17 +169,17 @@ public void testDecimalMerge() throws Exception {
 
     ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
     ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
-    stats1.updateDecimal(HiveDecimal.create(10));
-    stats1.updateDecimal(HiveDecimal.create(100));
-    stats2.updateDecimal(HiveDecimal.create(1));
-    stats2.updateDecimal(HiveDecimal.create(1000));
+    stats1.updateDecimal(new HiveDecimalWritable(10));
+    stats1.updateDecimal(new HiveDecimalWritable(100));
+    stats2.updateDecimal(new HiveDecimalWritable(1));
+    stats2.updateDecimal(new HiveDecimalWritable(1000));
     stats1.merge(stats2);
     DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1;
     assertEquals(1, typed.getMinimum().longValue());
     assertEquals(1000, typed.getMaximum().longValue());
     stats1.reset();
-    stats1.updateDecimal(HiveDecimal.create(-10));
-    stats1.updateDecimal(HiveDecimal.create(10000));
+    stats1.updateDecimal(new HiveDecimalWritable(-10));
+    stats1.updateDecimal(new HiveDecimalWritable(10000));
     stats1.merge(stats2);
     assertEquals(-10, typed.getMinimum().longValue());
     assertEquals(10000, typed.getMaximum().longValue());
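Only the producer-facing API changed type: updateDecimal() now takes a HiveDecimalWritable, while DecimalColumnStatistics.getMinimum(), getMaximum(), and getSum() still return HiveDecimal, which is why the assertions above are unchanged. A small hedged usage sketch (the decimal schema construction is assumed, mirroring the test setup):

import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.orc.DecimalColumnStatistics;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.ColumnStatisticsImpl;

class StatsApiSketch {
  public static void main(String[] args) {
    TypeDescription schema = TypeDescription.createDecimal();          // assumed decimal schema
    ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(schema);
    stats.updateDecimal(new HiveDecimalWritable("3.14"));              // update side takes the writable
    HiveDecimal min = ((DecimalColumnStatistics) stats).getMinimum();  // read side still returns HiveDecimal
    System.out.println(min);                                           // 3.14
  }
}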
