Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 6c40b46

Browse files
Support IGNORE_ERRORS when scanning from pyarrow/pandas (#4646)
* Support IGNORE_ERRORS when scanning from pyarrow/pandas
* minor fix

---------

Co-authored-by: xiyang <[email protected]>
1 parent b1c399c commit 6c40b46

7 files changed

Lines changed: 127 additions & 14 deletions

File tree

src_cpp/include/pandas/pandas_scan.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
#pragma once
22

3-
#include "function/scalar_function.h"
43
#include "function/table/bind_data.h"
54
#include "function/table/scan_functions.h"
65
#include "function/table_functions.h"
76
#include "pandas_bind.h"
8-
#include "pybind_include.h"
97

108
namespace kuzu {
119

@@ -33,17 +31,24 @@ struct PandasScanFunction {
3331
struct PandasScanFunctionData : public function::TableFuncBindData {
3432
py::handle df;
3533
std::vector<std::unique_ptr<PandasColumnBindData>> columnBindData;
34+
common::ReaderConfig config;
3635

3736
PandasScanFunctionData(binder::expression_vector columns, py::handle df, uint64_t numRows,
38-
std::vector<std::unique_ptr<PandasColumnBindData>> columnBindData)
37+
std::vector<std::unique_ptr<PandasColumnBindData>> columnBindData,
38+
common::ReaderConfig config)
3939
: TableFuncBindData{std::move(columns), 0 /* numWarningDataColumns */, numRows}, df{df},
40-
columnBindData{std::move(columnBindData)} {}
40+
columnBindData{std::move(columnBindData)}, config(std::move(config)) {}
4141

4242
~PandasScanFunctionData() override {
4343
py::gil_scoped_acquire acquire;
4444
columnBindData.clear();
4545
}
4646

47+
bool getIgnoreErrorsOption() const override {
48+
return config.getOption(common::CopyConstants::IGNORE_ERRORS_OPTION_NAME,
49+
common::CopyConstants::DEFAULT_IGNORE_ERRORS);
50+
}
51+
4752
std::vector<std::unique_ptr<PandasColumnBindData>> copyColumnBindData() const;
4853

4954
std::unique_ptr<function::TableFuncBindData> copy() const override {
@@ -56,6 +61,7 @@ struct PandasScanFunctionData : public function::TableFuncBindData {
5661
for (const auto& i : other.columnBindData) {
5762
columnBindData.push_back(i->copy());
5863
}
64+
config = other.config.copy();
5965
}
6066
};
6167

src_cpp/include/pyarrow/pyarrow_scan.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#pragma once
22

3+
#include <utility>
4+
35
#include "common/arrow/arrow.h"
46
#include "function/scalar_function.h"
57
#include "function/table/bind_data.h"
@@ -12,6 +14,7 @@ namespace kuzu {
1214
struct PyArrowScanConfig {
1315
uint64_t skipNum;
1416
uint64_t limitNum;
17+
bool ignoreErrors;
1518
explicit PyArrowScanConfig(const common::case_insensitive_map_t<common::Value>& options);
1619
};
1720

@@ -35,19 +38,22 @@ struct PyArrowTableScanSharedState final : public function::BaseScanSharedStateW
3538
struct PyArrowTableScanFunctionData final : public function::TableFuncBindData {
3639
std::shared_ptr<ArrowSchemaWrapper> schema;
3740
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches;
41+
bool ignoreErrors;
3842

3943
PyArrowTableScanFunctionData(binder::expression_vector columns,
4044
std::shared_ptr<ArrowSchemaWrapper> schema,
41-
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches, uint64_t numRows)
45+
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches, uint64_t numRows,
46+
bool ignoreErrors)
4247
: TableFuncBindData{std::move(columns), 0 /* numWarningDataColumns */, numRows},
43-
schema{std::move(schema)}, arrowArrayBatches{std::move(arrowArrayBatches)} {}
48+
schema{std::move(schema)}, arrowArrayBatches{std::move(arrowArrayBatches)},
49+
ignoreErrors(ignoreErrors) {}
4450

4551
~PyArrowTableScanFunctionData() override {}
4652

53+
bool getIgnoreErrorsOption() const override { return ignoreErrors; }
54+
4755
private:
48-
PyArrowTableScanFunctionData(const PyArrowTableScanFunctionData& other)
49-
: TableFuncBindData{other}, schema{other.schema},
50-
arrowArrayBatches{other.arrowArrayBatches} {}
56+
PyArrowTableScanFunctionData(const PyArrowTableScanFunctionData& other) = default;
5157

5258
public:
5359
std::unique_ptr<function::TableFuncBindData> copy() const override {

src_cpp/pandas/pandas_scan.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "common/exception/runtime.h"
66
#include "function/table/bind_input.h"
77
#include "numpy/numpy_scan.h"
8+
#include "processor/execution_context.h"
89
#include "py_connection.h"
910
#include "pyarrow/pyarrow_scan.h"
1011
#include "pybind11/pytypes.h"
@@ -31,8 +32,9 @@ std::unique_ptr<TableFuncBindData> bindFunc(ClientContext* /*context*/,
3132
auto getFunc = df.attr("__getitem__");
3233
auto numRows = py::len(getFunc(columns[0]));
3334
auto returnColumns = input->binder->createVariables(names, returnTypes);
35+
auto scanConfig = input->extraInput->constPtrCast<ExtraScanTableFuncBindInput>()->config.copy();
3436
return std::make_unique<PandasScanFunctionData>(std::move(returnColumns), df, numRows,
35-
std::move(columnBindData));
37+
std::move(columnBindData), std::move(scanConfig));
3638
}
3739

3840
bool sharedStateNext(const TableFuncBindData* /*bindData*/, PandasScanLocalState* localState,
@@ -119,6 +121,10 @@ static double progressFunc(TableFuncSharedState* sharedState) {
119121
return static_cast<double>(pandasSharedState->numRowsRead) / pandasSharedState->numRows;
120122
}
121123

124+
static void finalizeFunc(const processor::ExecutionContext* ctx, TableFuncSharedState*) {
125+
ctx->clientContext->getWarningContextUnsafe().defaultPopulateAllWarnings(ctx->queryID);
126+
}
127+
122128
function_set PandasScanFunction::getFunctionSet() {
123129
function_set functionSet;
124130
functionSet.push_back(getFunction().copy());
@@ -132,6 +138,7 @@ TableFunction PandasScanFunction::getFunction() {
132138
function.initSharedStateFunc = initSharedState;
133139
function.initLocalStateFunc = initLocalState;
134140
function.progressFunc = progressFunc;
141+
function.finalizeFunc = finalizeFunc;
135142
return function;
136143
}
137144

src_cpp/pyarrow/pyarrow_scan.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "common/arrow/arrow_converter.h"
66
#include "function/cast/functions/numeric_limits.h"
77
#include "function/table/bind_input.h"
8+
#include "processor/execution_context.h"
89
#include "py_connection.h"
910
#include "pyarrow/pyarrow_bind.h"
1011
#include "pybind11/pytypes.h"
@@ -18,6 +19,7 @@ namespace kuzu {
1819
PyArrowScanConfig::PyArrowScanConfig(const case_insensitive_map_t<Value>& options) {
1920
skipNum = 0;
2021
limitNum = NumericLimits<uint64_t>::maximum();
22+
ignoreErrors = CopyConstants::DEFAULT_IGNORE_ERRORS;
2123
for (const auto& i : options) {
2224
if (i.first == "SKIP") {
2325
if (i.second.getDataType().getLogicalTypeID() != LogicalTypeID::INT64 ||
@@ -31,8 +33,14 @@ PyArrowScanConfig::PyArrowScanConfig(const case_insensitive_map_t<Value>& option
3133
throw BinderException("LIMIT Option must be a positive integer literal.");
3234
}
3335
limitNum = i.second.val.int64Val;
36+
} else if (i.first == CopyConstants::IGNORE_ERRORS_OPTION_NAME) {
37+
if (i.second.getDataType().getLogicalTypeID() != LogicalTypeID::BOOL) {
38+
throw BinderException("IGNORE_ERRORS Option must be a boolean.");
39+
}
40+
ignoreErrors = i.second.val.booleanVal;
3441
} else {
35-
throw BinderException(stringFormat("{} Option not recognized by pyArrow scanner."));
42+
throw BinderException(
43+
stringFormat("{} Option not recognized by pyArrow scanner.", i.first));
3644
}
3745
}
3846
}
@@ -82,7 +90,7 @@ static std::unique_ptr<TableFuncBindData> bindFunc(ClientContext*,
8290

8391
auto columns = input->binder->createVariables(names, returnTypes);
8492
return std::make_unique<PyArrowTableScanFunctionData>(std::move(columns), std::move(schema),
85-
arrowArrayBatches, numRows);
93+
arrowArrayBatches, numRows, config.ignoreErrors);
8694
}
8795

8896
ArrowArrayWrapper* PyArrowTableScanSharedState::getNextChunk() {
@@ -143,13 +151,18 @@ function_set PyArrowTableScanFunction::getFunctionSet() {
143151
return functionSet;
144152
}
145153

154+
static void finalizeFunc(const processor::ExecutionContext* ctx, TableFuncSharedState*) {
155+
ctx->clientContext->getWarningContextUnsafe().defaultPopulateAllWarnings(ctx->queryID);
156+
}
157+
146158
TableFunction PyArrowTableScanFunction::getFunction() {
147159
auto function = TableFunction(name, std::vector{LogicalTypeID::POINTER});
148160
function.tableFunc = tableFunc;
149161
function.bindFunc = bindFunc;
150162
function.initSharedStateFunc = initSharedState;
151163
function.initLocalStateFunc = initLocalState;
152164
function.progressFunc = progressFunc;
165+
function.finalizeFunc = finalizeFunc;
153166
return function;
154167
}
155168

test/test_scan_pandas.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,5 +429,29 @@ def test_scan_string_to_nested(tmp_path: Path) -> None:
429429
)
430430
conn.execute("COPY tab from df")
431431
result = conn.execute("match (t:tab) return t.*")
432-
assert result.get_next() == [1, [1, 2, 3], {"'a'": 1, "'b'": 2}, {"a": 1, "b": 2}, [[], [1, 2, 3], [4, 5, 6]]]
432+
assert result.get_next() == [
433+
1,
434+
[1, 2, 3],
435+
{"'a'": 1, "'b'": 2},
436+
{"a": 1, "b": 2},
437+
[[], [1, 2, 3], [4, 5, 6]],
438+
]
433439
assert not result.has_next()
440+
441+
442+
def test_pandas_scan_ignore_errors(tmp_path: Path) -> None:
443+
db = kuzu.Database(tmp_path)
444+
conn = kuzu.Connection(db)
445+
df = pd.DataFrame({"id": [1, 2, 3, 1]})
446+
conn.execute("CREATE NODE TABLE person(id INT64, PRIMARY KEY(id))")
447+
conn.execute("COPY person FROM df(IGNORE_ERRORS=true)")
448+
449+
people = conn.execute("MATCH (p:person) RETURN p.id")
450+
assert people.get_next() == [1]
451+
assert people.get_next() == [2]
452+
assert people.get_next() == [3]
453+
assert not people.has_next()
454+
455+
warnings = conn.execute("CALL show_warnings() RETURN *")
456+
assert warnings.get_next()[1].startswith("Found duplicated primary key value 1")
457+
assert not warnings.has_next()

test/test_scan_polars.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,25 @@ def test_polars_error(conn_db_readonly: ConnDB) -> None:
4747
with pytest.raises(RuntimeError, match="Binder exception: Variable df is not in scope."):
4848
conn.execute("LOAD FROM df RETURN *;")
4949
df = []
50-
with pytest.raises(RuntimeError, match="Binder exception: Variable df found but no matches were scannable"):
50+
with pytest.raises(
51+
RuntimeError,
52+
match="Binder exception: Variable df found but no matches were scannable",
53+
):
5154
conn.execute("LOAD FROM df RETURN *;")
55+
56+
57+
def test_polars_scan_ignore_errors(conn_db_readwrite: ConnDB) -> None:
58+
conn, db = conn_db_readwrite
59+
df = pl.DataFrame({"id": [1, 2, 3, 1]})
60+
conn.execute("CREATE NODE TABLE ids(id INT64, PRIMARY KEY(id))")
61+
conn.execute("COPY ids FROM df(IGNORE_ERRORS=true)")
62+
63+
people = conn.execute("MATCH (i:ids) RETURN i.id")
64+
assert people.get_next() == [1]
65+
assert people.get_next() == [2]
66+
assert people.get_next() == [3]
67+
assert not people.has_next()
68+
69+
warnings = conn.execute("CALL show_warnings() RETURN *")
70+
assert warnings.get_next()[1].startswith("Found duplicated primary key value 1")
71+
assert not warnings.has_next()

test/test_scan_pyarrow.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import pyarrow as pa
2+
import pytest
23
from type_aliases import ConnDB
34

45

@@ -64,3 +65,39 @@ def test_pyarrow_copy_from(conn_db_readwrite: ConnDB) -> None:
6465
assert result.get_next() == [1, "honk", 2]
6566
assert result.get_next() == [2, "shoo", 3]
6667
assert result.get_next() == [3, "mimimimimimimi", 1]
68+
69+
70+
def test_pyarrow_scan_ignore_errors(conn_db_readwrite: ConnDB) -> None:
71+
conn, db = conn_db_readwrite
72+
tab = pa.Table.from_arrays(
73+
[
74+
pa.array([1, 2, 3, 1], type=pa.int32()),
75+
],
76+
names=["id"],
77+
)
78+
conn.execute("CREATE NODE TABLE ids(id INT64, PRIMARY KEY(id))")
79+
conn.execute("COPY ids FROM tab(IGNORE_ERRORS=true)")
80+
81+
people = conn.execute("MATCH (i:ids) RETURN i.id")
82+
assert people.get_next() == [1]
83+
assert people.get_next() == [2]
84+
assert people.get_next() == [3]
85+
assert not people.has_next()
86+
87+
warnings = conn.execute("CALL show_warnings() RETURN *")
88+
assert warnings.get_next()[1].startswith("Found duplicated primary key value 1")
89+
assert not warnings.has_next()
90+
91+
92+
def test_pyarrow_scan_invalid_option(conn_db_readwrite: ConnDB) -> None:
93+
conn, db = conn_db_readwrite
94+
tab = pa.Table.from_arrays(
95+
[
96+
pa.array([1, 2, 3], type=pa.int32()),
97+
],
98+
names=["id"],
99+
)
100+
conn.execute("CREATE NODE TABLE ids(id INT64, PRIMARY KEY(id))")
101+
error_message = "INVALID_OPTION Option not recognized by pyArrow scanner."
102+
with pytest.raises(RuntimeError, match=error_message):
103+
conn.execute("COPY ids FROM tab(INVALID_OPTION=1)")

0 commit comments

Comments
 (0)