Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
8 changes: 4 additions & 4 deletions dataframely/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
from .failure import FailureInfo
from .functional import (
concat_collection_members,
filter_relationship_one_to_at_least_one,
filter_relationship_one_to_one,
require_relationship_one_to_at_least_one,
require_relationship_one_to_one,
)
from .schema import Schema, deserialize_schema, read_parquet_metadata_schema

Expand All @@ -71,8 +71,8 @@
"Config",
"FailureInfo",
"concat_collection_members",
"filter_relationship_one_to_at_least_one",
"filter_relationship_one_to_one",
"require_relationship_one_to_at_least_one",
"require_relationship_one_to_one",
"Schema",
"deserialize_schema",
"read_parquet_metadata_schema",
Expand Down
44 changes: 36 additions & 8 deletions dataframely/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,40 +24,68 @@
# --------------------------------- RELATIONSHIP 1:1 --------------------------------- #


def require_relationship_one_to_one(
    lhs: LazyFrame[S] | pl.LazyFrame,
    rhs: LazyFrame[T] | pl.LazyFrame,
    /,
    on: str | list[str],
    *,
    drop_duplicates: bool = True,
) -> pl.LazyFrame:
    """Express a 1:1 mapping between data frames for a collection filter.

    This function was previously named ``filter_relationship_one_to_one``.

    Args:
        lhs: The first data frame in the 1:1 mapping.
        rhs: The second data frame in the 1:1 mapping.
        on: The columns to join the data frames on.
        drop_duplicates: If set to `True`, drops rows that are not uniquely
            identified by the join columns specified with `on`. If set to `False`,
            skips uniqueness checks and avoids performance penalties. Use with
            caution, as this may lead to unexpected results if rows in one or both
            of the data frames are not unique in the join columns.

    Returns:
        A data frame representing the inner join of the two inputs on the specified
        columns. With `drop_duplicates=True`, the result is filtered to ensure a 1:1
        relationship; with `drop_duplicates=False`, the plain join is returned
        without any uniqueness filtering.
    """
    if not drop_duplicates:
        # The caller vouches for uniqueness: join the inputs as they are.
        return lhs.join(rhs, on=on)

    # `keep="none"` removes *every* row whose key occurs more than once, so a key
    # survives only if it identifies exactly one row on each side.
    unique_lhs = lhs.unique(on, keep="none")
    unique_rhs = rhs.unique(on, keep="none")
    return unique_lhs.join(unique_rhs, on=on)


# ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- #


def require_relationship_one_to_at_least_one(
    lhs: LazyFrame[S] | pl.LazyFrame,
    rhs: LazyFrame[T] | pl.LazyFrame,
    /,
    on: str | list[str],
    *,
    drop_duplicates: bool = True,
) -> pl.LazyFrame:
    """Express a 1:{1,N} mapping between data frames for a collection filter.

    This function was previously named
    ``filter_relationship_one_to_at_least_one``.

    Args:
        lhs: The data frame with exactly one occurrence for the set of join columns.
        rhs: The data frame with at least one occurrence for the set of join columns.
        on: The columns to join the data frames on.
        drop_duplicates: If set to `True`, drops rows in `lhs` that are not uniquely
            identified by the join columns specified with `on`. If set to `False`,
            skips uniqueness checks and avoids performance penalties. Use with
            caution, as this may lead to unexpected results if rows in `lhs` are
            not unique in the join columns.

    Returns:
        A data frame representing the inner join of the two inputs on the specified
        columns. With `drop_duplicates=True`, the result is filtered to ensure a
        1:{1,N} relationship; with `drop_duplicates=False`, `lhs` is joined without
        a uniqueness check.
    """
    # `keep="none"` removes every `lhs` row whose key occurs more than once; when
    # the check is disabled, `lhs` is used unchanged.
    one_side = lhs.unique(on, keep="none") if drop_duplicates else lhs

    # `rhs` is always deduplicated on the join columns, in both modes, before it
    # is joined against `lhs`.
    return one_side.join(rhs.unique(on), on=on)


Expand Down
8 changes: 4 additions & 4 deletions tests/benches/test_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class SingleFilterCollection(dy.Collection):

@dy.filter()
def one_to_one(self) -> pl.LazyFrame:
    """Require a 1:1 relationship between `first` and `second` on `idx`."""
    return dy.require_relationship_one_to_one(self.first, self.second, on="idx")


@pytest.mark.benchmark(group="collection-filter-single")
Expand Down Expand Up @@ -79,17 +79,17 @@ class MultiFilterCollection(dy.Collection):

@dy.filter()
def one_to_one(self) -> pl.LazyFrame:
    """Require a 1:1 relationship between `first` and `second` on `idx`."""
    return dy.require_relationship_one_to_one(self.first, self.second, on="idx")

@dy.filter()
def one_to_at_least_one(self) -> pl.LazyFrame:
    """Require a 1:{1,N} relationship from `first` to `second` on `idx`."""
    return dy.require_relationship_one_to_at_least_one(
        self.first, self.second, on="idx"
    )

@dy.filter()
def one_to_at_least_one_reverse(self) -> pl.LazyFrame:
    """Require a 1:{1,N} relationship from `second` to `first` on `idx`."""
    # Same relationship as above with the roles of the two members swapped.
    return dy.require_relationship_one_to_at_least_one(
        self.second, self.first, on="idx"
    )

Expand Down
6 changes: 3 additions & 3 deletions tests/collection/test_matches.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ class MyCollection1(dy.Collection):
class MyCollection2(MyCollection1):
    """Variant of `MyCollection1` that adds a 1:1 filter on `foo`."""

    @dy.filter()
    def test_filter(self) -> pl.LazyFrame:
        """Require a 1:1 relationship of member `x` with itself on `foo`."""
        return dy.require_relationship_one_to_one(self.x, self.x, ["foo"])

# Should not match
assert not MyCollection1.matches(MyCollection2)
Expand All @@ -93,12 +93,12 @@ class BaseCollection(dy.Collection):
class MyCollection1(BaseCollection):
    """Variant of `BaseCollection` whose `test_filter` requires a 1:1 relationship."""

    @dy.filter()
    def test_filter(self) -> pl.LazyFrame:
        """Require a 1:1 relationship of member `x` with itself on `foo`."""
        return dy.require_relationship_one_to_one(self.x, self.x, ["foo"])

class MyCollection2(BaseCollection):
    """Variant of `BaseCollection` whose `test_filter` requires a 1:{1,N} relationship."""

    @dy.filter()
    def test_filter(self) -> pl.LazyFrame:
        """Require a 1:{1,N} relationship of member `x` with itself on `foo`."""
        return dy.require_relationship_one_to_at_least_one(
            self.x, self.x, ["foo"]
        )

assert not MyCollection1.matches(MyCollection2)

Expand Down
53 changes: 42 additions & 11 deletions tests/functional/test_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ class EmployeeSchema(dy.Schema):

@pytest.fixture()
def departments() -> dy.LazyFrame[DepartmentSchema]:
    """Provide three departments with the ids 1, 2 and 3."""
    return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2, 3]}))


@pytest.fixture()
def managers() -> dy.LazyFrame[ManagerSchema]:
    """Provide one manager each for the department ids 1 and 3."""
    return ManagerSchema.cast(
        pl.LazyFrame({"department_id": [1, 3], "name": ["Donald Duck", "Minnie Mouse"]})
    )


Expand All @@ -44,9 +44,9 @@ def employees() -> dy.LazyFrame[EmployeeSchema]:
return EmployeeSchema.cast(
pl.LazyFrame(
{
"department_id": [2, 2, 2],
"employee_number": [101, 102, 103],
"name": ["Huey", "Dewey", "Louie"],
"department_id": [2, 2, 2, 3],
"employee_number": [101, 102, 103, 104],
"name": ["Huey", "Dewey", "Louie", "Daisy"],
}
)
)
Expand All @@ -57,21 +57,52 @@ def employees() -> dy.LazyFrame[EmployeeSchema]:
# ------------------------------------------------------------------------------------ #


@pytest.mark.parametrize("drop_duplicates", [True, False])
def test_one_to_one(
    departments: dy.LazyFrame[DepartmentSchema],
    managers: dy.LazyFrame[ManagerSchema],
    drop_duplicates: bool,
) -> None:
    """Check the 1:1 join of departments and managers in both duplicate modes."""
    actual = dy.require_relationship_one_to_one(
        departments,
        managers,
        on="department_id",
        drop_duplicates=drop_duplicates,
    )
    # Compared as a set so the assertion does not depend on row order.
    assert set(actual.select("department_id").collect().to_series().to_list()) == {1, 3}


def test_one_to_one_drop_duplicates_rhs(
    departments: dy.LazyFrame[DepartmentSchema],
    employees: dy.LazyFrame[EmployeeSchema],
) -> None:
    """Check that keys duplicated in `rhs` are dropped from the 1:1 result."""
    actual = dy.require_relationship_one_to_one(
        departments,
        employees,
        on="department_id",
        drop_duplicates=True,
    )
    # Only department 3 is expected to remain in the result.
    assert actual.select("department_id").collect().to_series().to_list() == [3]


def test_one_to_one_drop_duplicates_lhs(
    employees: dy.LazyFrame[EmployeeSchema],
    managers: dy.LazyFrame[ManagerSchema],
) -> None:
    """Check that keys duplicated in `lhs` are dropped from the 1:1 result."""
    actual = dy.require_relationship_one_to_one(
        employees,
        managers,
        on="department_id",
        drop_duplicates=True,
    )
    # Only department 3 is expected to remain in the result.
    assert actual.select("department_id").collect().to_series().to_list() == [3]


@pytest.mark.parametrize("drop_duplicates", [True, False])
def test_one_to_at_least_one(
    departments: dy.LazyFrame[DepartmentSchema],
    employees: dy.LazyFrame[EmployeeSchema],
    drop_duplicates: bool,
) -> None:
    """Check the 1:{1,N} join of departments and employees in both duplicate modes.

    The `departments` fixture has no repeated `department_id`, so the expected
    result is the same whether or not duplicates in `lhs` are dropped.
    """
    actual = dy.require_relationship_one_to_at_least_one(
        departments,
        employees,
        on="department_id",
        drop_duplicates=drop_duplicates,
    )
    # Compared as a set so the assertion does not depend on row order.
    assert set(actual.select("department_id").collect().to_series().to_list()) == {2, 3}
Loading