|
18 | 18 | */
|
19 | 19 | package org.apache.iceberg.arrow.vectorized;
|
20 | 20 |
|
| 21 | +import static org.apache.iceberg.arrow.vectorized.ArrowVectorAccessors.getVectorAccessor; |
| 22 | + |
21 | 23 | import java.util.Map;
|
22 | 24 | import org.apache.arrow.memory.ArrowBuf;
|
23 | 25 | import org.apache.arrow.memory.BufferAllocator;
|
@@ -461,6 +463,49 @@ public static VectorizedArrowReader positionsWithSetArrowValidityVector() {
|
461 | 463 | return new PositionVectorReader(true);
|
462 | 464 | }
|
463 | 465 |
|
| 466 | + public static VectorizedArrowReader rowIds(long baseRowId, VectorizedArrowReader idReader) { |
| 467 | + return new RowIdVectorReader(baseRowId, idReader); |
| 468 | + } |
| 469 | + |
| 470 | + public static VectorizedArrowReader lastUpdated( |
| 471 | + Long baseRowId, Long fileLastUpdated, VectorizedArrowReader seqReader) { |
| 472 | + if (fileLastUpdated != null && baseRowId != null) { |
| 473 | + return new LastUpdatedSeqVectorReader(fileLastUpdated, seqReader); |
| 474 | + } else { |
| 475 | + return nulls(); |
| 476 | + } |
| 477 | + } |
| 478 | + |
| 479 | + public static VectorizedReader<?> replaceWithMetadataReader( |
| 480 | + Types.NestedField icebergField, |
| 481 | + VectorizedReader<?> reader, |
| 482 | + Map<Integer, ?> idToConstant, |
| 483 | + boolean setArrowValidityVector) { |
| 484 | + int id = icebergField.fieldId(); |
| 485 | + if (id == MetadataColumns.ROW_ID.fieldId()) { |
| 486 | + Long baseRowId = (Long) idToConstant.get(id); |
| 487 | + return rowIds(baseRowId, (VectorizedArrowReader) reader); |
| 488 | + } else if (id == MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId()) { |
| 489 | + Long baseRowId = (Long) idToConstant.get(id); |
| 490 | + Long fileSeqNumber = (Long) idToConstant.get(id); |
| 491 | + return VectorizedArrowReader.lastUpdated( |
| 492 | + baseRowId, fileSeqNumber, (VectorizedArrowReader) reader); |
| 493 | + } else if (idToConstant.containsKey(id)) { |
| 494 | + // containsKey is used because the constant may be null |
| 495 | + return new ConstantVectorReader<>(icebergField, idToConstant.get(id)); |
| 496 | + } else if (id == MetadataColumns.ROW_POSITION.fieldId()) { |
| 497 | + if (setArrowValidityVector) { |
| 498 | + return positionsWithSetArrowValidityVector(); |
| 499 | + } else { |
| 500 | + return VectorizedArrowReader.positions(); |
| 501 | + } |
| 502 | + } else if (id == MetadataColumns.IS_DELETED.fieldId()) { |
| 503 | + return new DeletedVectorReader(); |
| 504 | + } |
| 505 | + |
| 506 | + return reader; |
| 507 | + } |
| 508 | + |
464 | 509 | private static final class NullVectorReader extends VectorizedArrowReader {
|
465 | 510 | private static final NullVectorReader INSTANCE = new NullVectorReader();
|
466 | 511 |
|
@@ -530,12 +575,6 @@ private static BigIntVector newVector(int valueCount) {
|
530 | 575 | return vector;
|
531 | 576 | }
|
532 | 577 |
|
533 |
| - private static NullabilityHolder newNullabilityHolder(int size) { |
534 |
| - NullabilityHolder nullabilityHolder = new NullabilityHolder(size); |
535 |
| - nullabilityHolder.setNotNulls(0, size); |
536 |
| - return nullabilityHolder; |
537 |
| - } |
538 |
| - |
539 | 578 | @Override
|
540 | 579 | public void setRowGroupInfo(
|
541 | 580 | PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
|
@@ -567,6 +606,135 @@ public void close() {
|
567 | 606 | }
|
568 | 607 | }
|
569 | 608 |
|
| 609 | + private static final class RowIdVectorReader extends VectorizedArrowReader { |
| 610 | + private static final Field ROW_ID_ARROW_FIELD = ArrowSchemaUtil.convert(MetadataColumns.ROW_ID); |
| 611 | + |
| 612 | + private final long firstRowId; |
| 613 | + private final VectorizedReader<VectorHolder> idReader; |
| 614 | + private final VectorizedReader<VectorHolder> posReader; |
| 615 | + private NullabilityHolder nulls; |
| 616 | + |
| 617 | + private RowIdVectorReader(long firstRowId, VectorizedArrowReader idReader) { |
| 618 | + this.firstRowId = firstRowId; |
| 619 | + this.idReader = idReader != null ? idReader : nulls(); |
| 620 | + this.posReader = new PositionVectorReader(true); |
| 621 | + } |
| 622 | + |
| 623 | + @Override |
| 624 | + public VectorHolder read(VectorHolder reuse, int numValsToRead) { |
| 625 | + FieldVector positions = posReader.read(null, numValsToRead).vector(); |
| 626 | + VectorHolder ids = idReader.read(null, numValsToRead); |
| 627 | + BigIntVector vec = allocateBigIntVector(ROW_ID_ARROW_FIELD, numValsToRead); |
| 628 | + ArrowBuf dataBuffer = vec.getDataBuffer(); |
| 629 | + boolean isNullReader = ids.vector() == null; |
| 630 | + ArrowVectorAccessor<?, String, ?, ?> idsAccessor = |
| 631 | + isNullReader ? null : getVectorAccessor(ids); |
| 632 | + for (int i = 0; i < numValsToRead; i += 1) { |
| 633 | + long bufferOffset = (long) i * Long.BYTES; |
| 634 | + if (isNullReader || ids.nullabilityHolder().isNullAt(i) == 1) { |
| 635 | + long rowId = firstRowId + (Long) positions.getObject(i); |
| 636 | + dataBuffer.setLong(bufferOffset, rowId); |
| 637 | + } else { |
| 638 | + long materializedRowId = idsAccessor.getLong(i); |
| 639 | + dataBuffer.setLong(bufferOffset, materializedRowId); |
| 640 | + } |
| 641 | + } |
| 642 | + |
| 643 | + vec.setValueCount(numValsToRead); |
| 644 | + return VectorHolder.vectorHolder(vec, MetadataColumns.ROW_ID, nulls); |
| 645 | + } |
| 646 | + |
| 647 | + @Override |
| 648 | + public void setRowGroupInfo( |
| 649 | + PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) { |
| 650 | + idReader.setRowGroupInfo(source, metadata); |
| 651 | + posReader.setRowGroupInfo(source, metadata); |
| 652 | + } |
| 653 | + |
| 654 | + @Override |
| 655 | + public void setBatchSize(int batchSize) { |
| 656 | + if (nulls == null || nulls.size() < batchSize) { |
| 657 | + this.nulls = newNullabilityHolder(batchSize); |
| 658 | + } |
| 659 | + |
| 660 | + idReader.setBatchSize(batchSize); |
| 661 | + posReader.setBatchSize(batchSize); |
| 662 | + } |
| 663 | + |
| 664 | + @Override |
| 665 | + public void close() { |
| 666 | + // don't close vectors as they are not owned by readers |
| 667 | + } |
| 668 | + } |
| 669 | + |
| 670 | + private static final class LastUpdatedSeqVectorReader extends VectorizedArrowReader { |
| 671 | + private static final Field LAST_UPDATED_SEQ = |
| 672 | + ArrowSchemaUtil.convert(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); |
| 673 | + |
| 674 | + private final long lastUpdatedSeq; |
| 675 | + private final VectorizedReader<VectorHolder> seqReader; |
| 676 | + private NullabilityHolder nulls; |
| 677 | + |
| 678 | + private LastUpdatedSeqVectorReader( |
| 679 | + long lastUpdatedSeq, VectorizedReader<VectorHolder> seqReader) { |
| 680 | + this.lastUpdatedSeq = lastUpdatedSeq; |
| 681 | + this.seqReader = seqReader == null ? nulls() : seqReader; |
| 682 | + } |
| 683 | + |
| 684 | + @Override |
| 685 | + public VectorHolder read(VectorHolder reuse, int numValsToRead) { |
| 686 | + BigIntVector vec = allocateBigIntVector(LAST_UPDATED_SEQ, numValsToRead); |
| 687 | + ArrowBuf dataBuffer = vec.getDataBuffer(); |
| 688 | + VectorHolder seqNumbers = seqReader.read(null, numValsToRead); |
| 689 | + ArrowVectorAccessor<?, String, ?, ?> accessor = |
| 690 | + seqNumbers.vector() == null ? null : getVectorAccessor(seqNumbers); |
| 691 | + for (int i = 0; i < numValsToRead; i += 1) { |
| 692 | + long bufferOffset = (long) i * Long.BYTES; |
| 693 | + if (seqNumbers.vector() == null || seqNumbers.nullabilityHolder().isNullAt(i) == 1) { |
| 694 | + dataBuffer.setLong(bufferOffset, lastUpdatedSeq); |
| 695 | + } else { |
| 696 | + long materializedSeqNumber = accessor.getLong(i); |
| 697 | + dataBuffer.setLong(bufferOffset, materializedSeqNumber); |
| 698 | + } |
| 699 | + } |
| 700 | + |
| 701 | + vec.setValueCount(numValsToRead); |
| 702 | + return VectorHolder.vectorHolder(vec, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER, nulls); |
| 703 | + } |
| 704 | + |
| 705 | + @Override |
| 706 | + public void setRowGroupInfo( |
| 707 | + PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) { |
| 708 | + seqReader.setRowGroupInfo(source, metadata); |
| 709 | + } |
| 710 | + |
| 711 | + @Override |
| 712 | + public void setBatchSize(int batchSize) { |
| 713 | + if (nulls == null || nulls.size() < batchSize) { |
| 714 | + this.nulls = newNullabilityHolder(batchSize); |
| 715 | + } |
| 716 | + |
| 717 | + seqReader.setBatchSize(batchSize); |
| 718 | + } |
| 719 | + |
| 720 | + @Override |
| 721 | + public void close() { |
| 722 | + // don't close vectors as they are not owned by readers |
| 723 | + } |
| 724 | + } |
| 725 | + |
| 726 | + private static BigIntVector allocateBigIntVector(Field field, int valueCount) { |
| 727 | + BigIntVector vector = (BigIntVector) field.createVector(ArrowAllocation.rootAllocator()); |
| 728 | + vector.allocateNew(valueCount); |
| 729 | + return vector; |
| 730 | + } |
| 731 | + |
| 732 | + private static NullabilityHolder newNullabilityHolder(int size) { |
| 733 | + NullabilityHolder nullabilityHolder = new NullabilityHolder(size); |
| 734 | + nullabilityHolder.setNotNulls(0, size); |
| 735 | + return nullabilityHolder; |
| 736 | + } |
| 737 | + |
570 | 738 | /**
|
571 | 739 | * A Dummy Vector Reader which doesn't actually read files, instead it returns a dummy
|
572 | 740 | * VectorHolder which indicates the constant value which should be used for this column.
|
|
0 commit comments