Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 62f02c0

Browse files
author
lijinglun
committed
fix: fts match query on column without inverted index
1 parent 8658d2c commit 62f02c0

File tree

12 files changed

+860
-119
lines changed

12 files changed

+860
-119
lines changed

python/python/tests/test_scalar_index.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,38 @@ def test_full_text_search(dataset, with_position):
569569
)
570570

571571

572+
def test_unindexed_full_text_search_on_empty_index(tmp_path):
573+
# Create fts index on empty table
574+
schema = pa.schema({"text": pa.string()})
575+
ds = lance.write_dataset(pa.Table.from_pylist([], schema=schema), tmp_path)
576+
ds.create_scalar_index("text", "INVERTED")
577+
578+
# Append unindexed data
579+
ds.insert(pa.Table.from_pylist([{"text": "hello!"}], schema=schema))
580+
581+
# Fts search
582+
results = ds.scanner(
583+
columns=["text"],
584+
full_text_query="hello",
585+
).to_table()
586+
assert results.num_rows == 1
587+
588+
589+
def test_full_text_search_without_index(dataset):
590+
row = dataset.take(indices=[0], columns=["doc"])
591+
query_text = row.column(0)[0].as_py()
592+
query_text = query_text.split(" ")[0]
593+
query = MatchQuery(query_text, column="doc")
594+
results = dataset.scanner(
595+
columns=["doc"],
596+
full_text_query=query,
597+
).to_table()
598+
assert results.num_rows > 0
599+
results = results.column(0)
600+
for row in results:
601+
assert query_text in row.as_py()
602+
603+
572604
def test_rowid_order(dataset):
573605
dataset.create_scalar_index("doc", index_type="INVERTED", with_position=False)
574606
results = dataset.scanner(

rust/lance-datafusion/src/datagen.rs

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ use datafusion::{
77
execution::SendableRecordBatchStream,
88
physical_plan::{stream::RecordBatchStreamAdapter, ExecutionPlan},
99
};
10-
use datafusion_common::DataFusionError;
10+
use datafusion_common::{DataFusionError, Result};
1111
use futures::TryStreamExt;
1212
use lance_datagen::{BatchCount, BatchGeneratorBuilder, RowCount};
1313

14-
use crate::exec::OneShotExec;
14+
use crate::exec::{OneShotExec, RecordBatchExec};
1515

1616
pub trait DatafusionDatagenExt {
1717
fn into_df_stream(
@@ -20,7 +20,17 @@ pub trait DatafusionDatagenExt {
2020
num_batches: BatchCount,
2121
) -> SendableRecordBatchStream;
2222

23-
fn into_df_exec(self, batch_size: RowCount, num_batches: BatchCount) -> Arc<dyn ExecutionPlan>;
23+
fn into_df_once_exec(
24+
self,
25+
batch_size: RowCount,
26+
num_batches: BatchCount,
27+
) -> Arc<dyn ExecutionPlan>;
28+
29+
fn into_df_repeat_exec(
30+
self,
31+
batch_size: RowCount,
32+
num_batches: BatchCount,
33+
) -> Result<Arc<dyn ExecutionPlan>>;
2434
}
2535

2636
impl DatafusionDatagenExt for BatchGeneratorBuilder {
@@ -34,8 +44,29 @@ impl DatafusionDatagenExt for BatchGeneratorBuilder {
3444
Box::pin(RecordBatchStreamAdapter::new(schema, stream))
3545
}
3646

37-
fn into_df_exec(self, batch_size: RowCount, num_batches: BatchCount) -> Arc<dyn ExecutionPlan> {
47+
fn into_df_once_exec(
48+
self,
49+
batch_size: RowCount,
50+
num_batches: BatchCount,
51+
) -> Arc<dyn ExecutionPlan> {
3852
let stream = self.into_df_stream(batch_size, num_batches);
3953
Arc::new(OneShotExec::new(stream))
4054
}
55+
56+
fn into_df_repeat_exec<'a>(
57+
self,
58+
batch_size: RowCount,
59+
num_batches: BatchCount,
60+
) -> Result<Arc<dyn ExecutionPlan>> {
61+
let reader = self.into_reader_rows(batch_size, num_batches);
62+
let batches = reader
63+
.collect::<Vec<_>>()
64+
.into_iter()
65+
.map(|r| match r {
66+
Ok(batch) => Ok(batch),
67+
Err(e) => Err(DataFusionError::Execution(e.to_string())),
68+
})
69+
.collect::<Result<Vec<_>>>()?;
70+
Ok(Arc::new(RecordBatchExec::new(batches)?))
71+
}
4172
}

rust/lance-datafusion/src/exec.rs

Lines changed: 102 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,9 @@
33

44
//! Utilities for working with datafusion execution plans
55
6-
use std::{
7-
collections::HashMap,
8-
fmt::{self, Formatter},
9-
sync::{Arc, LazyLock, Mutex},
10-
time::Duration,
11-
};
12-
136
use arrow_array::RecordBatch;
14-
use arrow_schema::Schema as ArrowSchema;
7+
use arrow_schema::{Schema as ArrowSchema, SchemaRef};
8+
use datafusion::physical_plan::memory::MemoryStream;
159
use datafusion::{
1610
catalog::streaming::StreamingTable,
1711
dataframe::DataFrame,
@@ -33,6 +27,13 @@ use datafusion::{
3327
};
3428
use datafusion_common::{DataFusionError, Statistics};
3529
use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
30+
use std::any::Any;
31+
use std::{
32+
collections::HashMap,
33+
fmt::{self, Formatter},
34+
sync::{Arc, LazyLock, Mutex},
35+
time::Duration,
36+
};
3637

3738
use futures::{stream, StreamExt};
3839
use lance_arrow::SchemaExt;
@@ -201,6 +202,99 @@ impl ExecutionPlan for OneShotExec {
201202
}
202203
}
203204

205+
/// A source execution node created from existing record batches.
206+
pub struct RecordBatchExec {
207+
batches: Vec<RecordBatch>,
208+
schema: SchemaRef,
209+
properties: PlanProperties,
210+
}
211+
212+
impl RecordBatchExec {
213+
pub fn new(batches: Vec<RecordBatch>) -> Result<Self> {
214+
if batches.is_empty() {
215+
return Err(Error::InvalidInput {
216+
source: "RecordBatchExec requires at least one batch".into(),
217+
location: location!(),
218+
});
219+
}
220+
let schema = batches[0].schema();
221+
Ok(Self {
222+
batches,
223+
schema: schema.clone(),
224+
properties: PlanProperties::new(
225+
EquivalenceProperties::new(schema),
226+
Partitioning::RoundRobinBatch(1),
227+
EmissionType::Incremental,
228+
Boundedness::Bounded,
229+
),
230+
})
231+
}
232+
}
233+
234+
impl std::fmt::Debug for RecordBatchExec {
235+
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
236+
write!(f, "RecordBatchExec")
237+
}
238+
}
239+
240+
impl DisplayAs for RecordBatchExec {
241+
fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> fmt::Result {
242+
match t {
243+
DisplayFormatType::Default
244+
| DisplayFormatType::Verbose
245+
| DisplayFormatType::TreeRender => {
246+
write!(f, "RecordBatchExec")
247+
}
248+
}
249+
}
250+
}
251+
252+
impl ExecutionPlan for RecordBatchExec {
253+
fn name(&self) -> &str {
254+
"RecordBatchExec"
255+
}
256+
257+
fn as_any(&self) -> &dyn Any {
258+
self
259+
}
260+
261+
fn properties(&self) -> &PlanProperties {
262+
&self.properties
263+
}
264+
265+
fn schema(&self) -> SchemaRef {
266+
self.schema.clone()
267+
}
268+
269+
fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
270+
vec![]
271+
}
272+
273+
fn with_new_children(
274+
self: Arc<Self>,
275+
children: Vec<Arc<dyn ExecutionPlan>>,
276+
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
277+
if !children.is_empty() {
278+
return Err(DataFusionError::Internal(
279+
"RecordBatchExec does not support children".to_string(),
280+
));
281+
}
282+
Ok(self)
283+
}
284+
285+
fn execute(
286+
&self,
287+
_partition: usize,
288+
_context: Arc<TaskContext>,
289+
) -> datafusion_common::Result<SendableRecordBatchStream> {
290+
Ok(Box::pin(MemoryStream::try_new(
291+
self.batches.clone(),
292+
self.schema.clone(),
293+
None,
294+
)?))
295+
}
296+
}
297+
204298
struct TracedExec {
205299
input: Arc<dyn ExecutionPlan>,
206300
properties: PlanProperties,

rust/lance-index/src/scalar/btree.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2143,7 +2143,7 @@ mod tests {
21432143
let data = gen_batch()
21442144
.col("value", array::cycle::<Float64Type>(values.clone()))
21452145
.col("_rowid", array::step::<UInt64Type>())
2146-
.into_df_exec(RowCount::from(10), BatchCount::from(100));
2146+
.into_df_once_exec(RowCount::from(10), BatchCount::from(100));
21472147
let schema = data.schema();
21482148
let sort_expr = PhysicalSortExpr::new_default(col("value", schema.as_ref()).unwrap());
21492149
let plan = Arc::new(SortExec::new([sort_expr].into(), data));
@@ -2185,7 +2185,7 @@ mod tests {
21852185
let data = gen_batch()
21862186
.col("value", array::step::<Float32Type>())
21872187
.col("_rowid", array::step::<UInt64Type>())
2188-
.into_df_exec(RowCount::from(1000), BatchCount::from(10));
2188+
.into_df_once_exec(RowCount::from(1000), BatchCount::from(10));
21892189
let schema = data.schema();
21902190
let sort_expr = PhysicalSortExpr::new_default(col("value", schema.as_ref()).unwrap());
21912191
let plan = Arc::new(SortExec::new([sort_expr].into(), data));

rust/lance-index/src/scalar/inverted/builder.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -655,15 +655,15 @@ pub enum PositionRecorder {
655655
}
656656

657657
impl PositionRecorder {
658-
fn new(with_position: bool) -> Self {
658+
pub fn new(with_position: bool) -> Self {
659659
if with_position {
660660
Self::Position(Vec::new())
661661
} else {
662662
Self::Count(0)
663663
}
664664
}
665665

666-
fn push(&mut self, position: u32) {
666+
pub fn push(&mut self, position: u32) {
667667
match self {
668668
Self::Position(positions) => positions.push(position),
669669
Self::Count(count) => *count += 1,

0 commit comments

Comments (0)