Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 5eb532f

Browse files
authored
Merge branch 'main' into inline_transaction_model
2 parents 026c038 + 20fceb1 commit 5eb532f

File tree

20 files changed

+1678
-205
lines changed

20 files changed

+1678
-205
lines changed

java/lance-jni/src/blocking_dataset.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -182,27 +182,27 @@ impl BlockingDataset {
182182
}
183183

184184
pub fn list_tags(&self) -> Result<HashMap<String, TagContents>> {
185-
let tags = RT.block_on(self.inner.tags.list())?;
185+
let tags = RT.block_on(self.inner.tags().list())?;
186186
Ok(tags)
187187
}
188188

189189
pub fn create_tag(&mut self, tag: &str, version: u64) -> Result<()> {
190-
RT.block_on(self.inner.tags.create(tag, version))?;
190+
RT.block_on(self.inner.tags().create(tag, version))?;
191191
Ok(())
192192
}
193193

194194
pub fn delete_tag(&mut self, tag: &str) -> Result<()> {
195-
RT.block_on(self.inner.tags.delete(tag))?;
195+
RT.block_on(self.inner.tags().delete(tag))?;
196196
Ok(())
197197
}
198198

199199
pub fn update_tag(&mut self, tag: &str, version: u64) -> Result<()> {
200-
RT.block_on(self.inner.tags.update(tag, version))?;
200+
RT.block_on(self.inner.tags().update(tag, version))?;
201201
Ok(())
202202
}
203203

204204
pub fn get_version(&self, tag: &str) -> Result<u64> {
205-
let version = RT.block_on(self.inner.tags.get_version(tag))?;
205+
let version = RT.block_on(self.inner.tags().get_version(tag))?;
206206
Ok(version)
207207
}
208208

protos/table.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ message Manifest {
178178
// base_paths[id = 0] + /data/ + file.path
179179
// the key(a.k.a index) starts from 0, increased by 1 for each new base path.
180180
repeated BasePath base_paths = 18;
181+
182+
// The branch of the dataset. None means main branch.
183+
optional string branch = 20;
181184
} // Manifest
182185

183186
// external dataset base path

protos/transaction.proto

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,13 +172,14 @@ message Transaction {
172172
// integrity guarantees provided by the storage backend.
173173
bool is_shallow = 1;
174174
// the reference name in the source dataset
175-
// for branch scenario, this is the branch name in the source dataset
176-
// for non branch scenario(normal cloning), this is an optional tag name in the source dataset
175+
// in most cases it should be the branch or tag name in the source dataset
177176
optional string ref_name = 2;
178177
// the version of the source dataset for cloning
179178
uint64 ref_version = 3;
180179
// the absolute base path of the source dataset for cloning
181180
string ref_path = 4;
181+
// if the target dataset is a branch, this is the branch name of the target dataset
182+
optional string branch_name = 5;
182183
}
183184

184185
// An operation that updates rows but does not add or remove rows.

python/python/tests/test_vector_index.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,30 @@ def test_create_ivf_pq_with_target_partition_size(dataset, tmp_path):
609609
assert ann_ds.stats.index_stats("vector_idx")["indices"][0]["num_partitions"] == 2
610610

611611

612+
def test_index_size_stats(tmp_path: Path):
613+
num_rows = 512
614+
dims = 32
615+
schema = pa.schema([pa.field("a", pa.list_(pa.float32(), dims), False)])
616+
values = pc.random(num_rows * dims).cast("float32")
617+
table = pa.Table.from_pydict(
618+
{"a": pa.FixedSizeListArray.from_arrays(values, dims)}, schema=schema
619+
)
620+
621+
base_dir = tmp_path / "test"
622+
623+
dataset = lance.write_dataset(table, base_dir)
624+
625+
index_name = "vec_idx"
626+
dataset.create_index(
627+
"a", "IVF_PQ", name=index_name, num_partitions=2, num_sub_vectors=1
628+
)
629+
630+
# Expect to see non-zero sizes here but all sizes are zero
631+
stats = dataset.stats.index_stats(index_name)
632+
stats = stats["indices"][0]
633+
assert stats["partitions"][0]["size"] + stats["partitions"][1]["size"] == num_rows
634+
635+
612636
def test_ivf_flat_over_binary_vector(tmp_path):
613637
dim = 128
614638
nvec = 1000

python/src/dataset.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1414,7 +1414,7 @@ impl Dataset {
14141414
}
14151415

14161416
fn get_version(self_: PyRef<'_, Self>, tag: String) -> PyResult<u64> {
1417-
let inner_result = RT.block_on(None, self_.ds.tags.get_version(&tag))?;
1417+
let inner_result = RT.block_on(None, self_.ds.tags().get_version(&tag))?;
14181418

14191419
inner_result.map_err(|err: lance::Error| match err {
14201420
lance::Error::NotFound { .. } => {
@@ -1428,8 +1428,8 @@ impl Dataset {
14281428
}
14291429

14301430
fn create_tag(&mut self, tag: String, version: u64) -> PyResult<()> {
1431-
let mut new_self = self.ds.as_ref().clone();
1432-
RT.block_on(None, new_self.tags.create(tag.as_str(), version))?
1431+
let new_self = self.ds.as_ref().clone();
1432+
RT.block_on(None, new_self.tags().create(tag.as_str(), version))?
14331433
.map_err(|err| match err {
14341434
lance::Error::NotFound { .. } => PyValueError::new_err(err.to_string()),
14351435
lance::Error::RefConflict { .. } => PyValueError::new_err(err.to_string()),
@@ -1441,8 +1441,8 @@ impl Dataset {
14411441
}
14421442

14431443
fn delete_tag(&mut self, tag: String) -> PyResult<()> {
1444-
let mut new_self = self.ds.as_ref().clone();
1445-
RT.block_on(None, new_self.tags.delete(tag.as_str()))?
1444+
let new_self = self.ds.as_ref().clone();
1445+
RT.block_on(None, new_self.tags().delete(tag.as_str()))?
14461446
.map_err(|err| match err {
14471447
lance::Error::NotFound { .. } => PyValueError::new_err(err.to_string()),
14481448
lance::Error::RefNotFound { .. } => PyValueError::new_err(err.to_string()),
@@ -1453,8 +1453,8 @@ impl Dataset {
14531453
}
14541454

14551455
fn update_tag(&mut self, tag: String, version: u64) -> PyResult<()> {
1456-
let mut new_self = self.ds.as_ref().clone();
1457-
RT.block_on(None, new_self.tags.update(tag.as_str(), version))?
1456+
let new_self = self.ds.as_ref().clone();
1457+
RT.block_on(None, new_self.tags().update(tag.as_str(), version))?
14581458
.infer_error()?;
14591459
self.ds = Arc::new(new_self);
14601460
Ok(())
@@ -2453,7 +2453,7 @@ impl Dataset {
24532453
}
24542454

24552455
fn list_tags(&self) -> PyResult<HashMap<String, TagContents>> {
2456-
RT.block_on(None, self.ds.tags.list())?.infer_error()
2456+
RT.block_on(None, self.ds.tags().list())?.infer_error()
24572457
}
24582458

24592459
fn list_tags_ordered(&self, order: Option<&str>) -> PyResult<Vec<(String, TagContents)>> {
@@ -2469,7 +2469,7 @@ impl Dataset {
24692469
None => None,
24702470
};
24712471
RT.block_on(None, async {
2472-
self.ds.tags.list_tags_ordered(ordering).await
2472+
self.ds.tags().list_tags_ordered(ordering).await
24732473
})?
24742474
.infer_error()
24752475
}

rust/lance-core/src/datatypes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ impl TryFrom<&DataType> for LogicalType {
155155
},
156156
DataType::FixedSizeList(field, len) => {
157157
if is_bfloat16_field(field) {
158-
// Don't want to directly use `blfoat16`, in case a built-in type is added
158+
// Don't want to directly use `bfloat16`, in case a built-in type is added
159159
// that isn't identical to our extension type.
160160
format!("fixed_size_list:lance.bfloat16:{}", *len)
161161
} else {

rust/lance-index/src/vector/storage.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@ impl<Q: Quantization> IvfQuantizationStorage<Q> {
248248
self.reader.num_rows()
249249
}
250250

251+
pub fn partition_size(&self, part_id: usize) -> usize {
252+
self.ivf.partition_size(part_id)
253+
}
254+
251255
pub fn quantizer(&self) -> Result<Quantizer> {
252256
let metadata = self.metadata();
253257
Q::from_metadata(metadata, self.distance_type)

rust/lance-io/src/object_writer.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -548,11 +548,9 @@ mod tests {
548548
async fn test_abort_write() {
549549
let store = LanceObjectStore::memory();
550550

551-
let mut object_writer = futures::executor::block_on(async move {
552-
ObjectWriter::new(&store, &Path::from("/foo"))
553-
.await
554-
.unwrap()
555-
});
551+
let mut object_writer = ObjectWriter::new(&store, &Path::from("/foo"))
552+
.await
553+
.unwrap();
556554
object_writer.abort().await;
557555
}
558556
}

rust/lance-table/src/format/manifest.rs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// SPDX-FileCopyrightText: Copyright The Lance Authors
33

4-
use std::collections::{BTreeMap, HashMap};
5-
use std::ops::Range;
6-
use std::sync::Arc;
7-
84
use async_trait::async_trait;
95
use chrono::prelude::*;
106
use deepsize::DeepSizeOf;
@@ -15,6 +11,9 @@ use lance_io::traits::{ProtoStruct, Reader};
1511
use object_store::path::Path;
1612
use prost::Message;
1713
use prost_types::Timestamp;
14+
use std::collections::{BTreeMap, HashMap};
15+
use std::ops::Range;
16+
use std::sync::Arc;
1817

1918
use super::Fragment;
2019
use crate::feature_flags::{has_deprecated_v2_feature_flag, FLAG_STABLE_ROW_IDS};
@@ -43,6 +42,9 @@ pub struct Manifest {
4342
/// Dataset version
4443
pub version: u64,
4544

45+
/// Branch name, None if the dataset is the main branch.
46+
pub branch: Option<String>,
47+
4648
/// Version of the writer library that wrote this manifest.
4749
pub writer_version: Option<WriterVersion>,
4850

@@ -70,7 +72,7 @@ pub struct Manifest {
7072
/// The writer flags
7173
pub writer_feature_flags: u64,
7274

73-
/// The max fragment id used so far
75+
/// The max fragment id used so far
7476
/// None means never set, Some(0) means max ID used so far is 0
7577
pub max_fragment_id: Option<u32>,
7678

@@ -185,6 +187,7 @@ impl Manifest {
185187
schema,
186188
local_schema,
187189
version: 1,
190+
branch: None,
188191
writer_version: Some(WriterVersion::default()),
189192
fragments,
190193
version_aux_data: 0,
@@ -221,6 +224,7 @@ impl Manifest {
221224
schema,
222225
local_schema,
223226
version: previous.version + 1,
227+
branch: previous.branch.clone(),
224228
writer_version: Some(WriterVersion::default()),
225229
fragments,
226230
version_aux_data: 0,
@@ -245,11 +249,13 @@ impl Manifest {
245249
/// Performs a shallow_clone of the manifest entirely in memory without:
246250
/// - Any persistent storage operations
247251
/// - Modifications to the original data
252+
/// - If the shallow clone is for a branch, ref_name is the source branch name
248253
pub fn shallow_clone(
249254
&self,
250255
ref_name: Option<String>,
251256
ref_path: String,
252257
ref_base_id: u32,
258+
branch_name: Option<String>,
253259
transaction_file: String,
254260
) -> Self {
255261
let cloned_fragments = self
@@ -277,6 +283,7 @@ impl Manifest {
277283
schema: self.schema.clone(),
278284
local_schema: self.local_schema.clone(),
279285
version: self.version,
286+
branch: branch_name,
280287
writer_version: self.writer_version.clone(),
281288
fragments: Arc::new(cloned_fragments),
282289
version_aux_data: self.version_aux_data,
@@ -770,6 +777,7 @@ impl TryFrom<pb::Manifest> for Manifest {
770777
schema,
771778
local_schema,
772779
version: p.version,
780+
branch: p.branch,
773781
writer_version,
774782
version_aux_data: p.version_aux_data as usize,
775783
index_section: p.index_section.map(|i| i as usize),
@@ -826,6 +834,7 @@ impl From<&Manifest> for pb::Manifest {
826834
.map(|(k, v)| (k.clone(), v.as_bytes().to_vec()))
827835
.collect(),
828836
version: m.version,
837+
branch: m.branch.clone(),
829838
writer_version: m
830839
.writer_version
831840
.as_ref()

0 commit comments

Comments
 (0)