Quantco · delsner · Oct 20, 2025 · Jun 25, 2025 · Jun 25, 2025 · Jul 1, 2025
@@ -54,8 +54,8 @@
 from .failure import FailureInfo
 from .functional import (
     concat_collection_members,
-    filter_relationship_one_to_at_least_one,
-    filter_relationship_one_to_one,
+    require_relationship_one_to_at_least_one,
+    require_relationship_one_to_one,
 )
 from .schema import Schema, deserialize_schema, read_parquet_metadata_schema
 
@@ -71,8 +71,8 @@
     "Config",
     "FailureInfo",
     "concat_collection_members",
-    "filter_relationship_one_to_at_least_one",
-    "filter_relationship_one_to_one",
+    "require_relationship_one_to_at_least_one",
+    "require_relationship_one_to_one",
     "Schema",
     "deserialize_schema",
     "read_parquet_metadata_schema",

@@ -24,40 +24,68 @@
 # --------------------------------- RELATIONSHIP 1:1 --------------------------------- #
 
 
-def filter_relationship_one_to_one(
+def require_relationship_one_to_one(
     lhs: LazyFrame[S] | pl.LazyFrame,
     rhs: LazyFrame[T] | pl.LazyFrame,
     /,
     on: str | list[str],
+    *,
+    drop_duplicates: bool = True,
 ) -> pl.LazyFrame:
     """Express a 1:1 mapping between data frames for a collection filter.
 
     Args:
         lhs: The first data frame in the 1:1 mapping.
         rhs: The second data frame in the 1:1 mapping.
-        on: The columns to join the data frames on. If not provided, the join columns
-            are inferred from the mutual primary keys of the provided data frames.
+        on: The columns to join the data frames on.
+        drop_duplicates: If set to `True`, drops rows that are not uniquely identified by the
+            join columns specified with `on`. If set to `False`, skips uniqueness checks
+            and avoids performance penalties. Use with caution, as this may lead to unexpected
+            results if rows in one or both of the data frames are not unique in the join columns.
+
+    Returns:
+        The inner join of `lhs` and `rhs` on `on`. A 1:1 relationship is only
+        guaranteed if `drop_duplicates` is `True` or both inputs are unique in `on`.
     """
+    if drop_duplicates:
+        return lhs.unique(on, keep="none").join(
+            rhs.unique(on, keep="none"),
+            on=on,
+        )
+
     return lhs.join(rhs, on=on)
 
 
 # ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- #
 
 
-def filter_relationship_one_to_at_least_one(
+def require_relationship_one_to_at_least_one(
     lhs: LazyFrame[S] | pl.LazyFrame,
     rhs: LazyFrame[T] | pl.LazyFrame,
     /,
     on: str | list[str],
+    *,
+    drop_duplicates: bool = True,
 ) -> pl.LazyFrame:
     """Express a 1:{1,N} mapping between data frames for a collection filter.
 
     Args:
-        lhs: The data frame with exactly one occurrence for a set of key columns.
-        rhs: The data frame with at least one occurrence for a set of key columns.
-        on: The columns to join the data frames on. If not provided, the join columns
-            are inferred from the joint primary keys of the provided data frames.
+        lhs: The data frame with exactly one occurrence for the set of join columns.
+        rhs: The data frame with at least one occurrence for the set of join columns.
+        on: The columns to join the data frames on.
+        drop_duplicates: If set to `True`, drops rows in `lhs` that are not uniquely
+            identified by the join columns specified with `on`. If set to `False`,
+            skips uniqueness checks and avoids performance penalties. Use with
+            caution, as this may lead to unexpected results if rows in `lhs` are
+            not unique in the join columns.
+
+    Returns:
+        The inner join of `lhs` with `rhs` reduced to one row per key in `on`. Rows of
+        `lhs` whose key is duplicated are dropped only if `drop_duplicates` is `True`.
     """
+    if drop_duplicates:
+        return lhs.unique(on, keep="none").join(rhs.unique(on), on=on)
+
     return lhs.join(rhs.unique(on), on=on)
 
 

@@ -49,7 +49,7 @@ class SingleFilterCollection(dy.Collection):
 
     @dy.filter()
     def one_to_one(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_one(self.first, self.second, on="idx")
+        return dy.require_relationship_one_to_one(self.first, self.second, on="idx")
 
 
 @pytest.mark.benchmark(group="collection-filter-single")
@@ -79,17 +79,17 @@ class MultiFilterCollection(dy.Collection):
 
     @dy.filter()
     def one_to_one(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_one(self.first, self.second, on="idx")
+        return dy.require_relationship_one_to_one(self.first, self.second, on="idx")
 
     @dy.filter()
     def one_to_at_least_one(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_at_least_one(
+        return dy.require_relationship_one_to_at_least_one(
             self.first, self.second, on="idx"
         )
 
     @dy.filter()
     def one_to_at_least_one_reverse(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_at_least_one(
+        return dy.require_relationship_one_to_at_least_one(
             self.second, self.first, on="idx"
         )
 

@@ -74,7 +74,7 @@ class MyCollection1(dy.Collection):
     class MyCollection2(MyCollection1):
         @dy.filter()
         def test_filter(self) -> pl.LazyFrame:
-            return dy.filter_relationship_one_to_one(self.x, self.x, ["foo"])
+            return dy.require_relationship_one_to_one(self.x, self.x, ["foo"])
 
     # Should not match
     assert not MyCollection1.matches(MyCollection2)
@@ -93,12 +93,12 @@ class BaseCollection(dy.Collection):
     class MyCollection1(BaseCollection):
         @dy.filter()
         def test_filter(self) -> pl.LazyFrame:
-            return dy.filter_relationship_one_to_one(self.x, self.x, ["foo"])
+            return dy.require_relationship_one_to_one(self.x, self.x, ["foo"])
 
     class MyCollection2(BaseCollection):
         @dy.filter()
         def test_filter(self) -> pl.LazyFrame:
-            return dy.filter_relationship_one_to_at_least_one(self.x, self.x, ["foo"])
+            return dy.require_relationship_one_to_at_least_one(self.x, self.x, ["foo"])
 
     assert not MyCollection1.matches(MyCollection2)
 

@@ -29,13 +29,13 @@ class EmployeeSchema(dy.Schema):
 
 @pytest.fixture()
 def departments() -> dy.LazyFrame[DepartmentSchema]:
-    return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2]}))
+    return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2, 3]}))
 
 
 @pytest.fixture()
 def managers() -> dy.LazyFrame[ManagerSchema]:
     return ManagerSchema.cast(
-        pl.LazyFrame({"department_id": [1], "name": ["Donald Duck"]})
+        pl.LazyFrame({"department_id": [1, 3], "name": ["Donald Duck", "Minnie Mouse"]})
     )
 
 
@@ -44,9 +44,9 @@ def employees() -> dy.LazyFrame[EmployeeSchema]:
     return EmployeeSchema.cast(
         pl.LazyFrame(
             {
-                "department_id": [2, 2, 2],
-                "employee_number": [101, 102, 103],
-                "name": ["Huey", "Dewey", "Louie"],
+                "department_id": [2, 2, 2, 3],
+                "employee_number": [101, 102, 103, 104],
+                "name": ["Huey", "Dewey", "Louie", "Daisy"],
             }
         )
     )
@@ -57,21 +57,52 @@ def employees() -> dy.LazyFrame[EmployeeSchema]:
 # ------------------------------------------------------------------------------------ #
 
 
+@pytest.mark.parametrize("drop_duplicates", [True, False])
 def test_one_to_one(
     departments: dy.LazyFrame[DepartmentSchema],
     managers: dy.LazyFrame[ManagerSchema],
+    drop_duplicates: bool,
 ) -> None:
-    actual = dy.filter_relationship_one_to_one(
-        departments, managers, on="department_id"
+    actual = dy.require_relationship_one_to_one(
+        departments,
+        managers,
+        on="department_id",
+        drop_duplicates=drop_duplicates,
     )
-    assert actual.select("department_id").collect().to_series().to_list() == [1]
+    assert set(actual.select("department_id").collect().to_series().to_list()) == {1, 3}
+
+
+def test_one_to_one_drop_duplicates_rhs(
+    departments: dy.LazyFrame[DepartmentSchema],
+    employees: dy.LazyFrame[EmployeeSchema],
+) -> None:
+    actual = dy.require_relationship_one_to_one(
+        departments,
+        employees,
+        on="department_id",
+        drop_duplicates=True,
+    )
+    assert actual.select("department_id").collect().to_series().to_list() == [3]
+
+
+def test_one_to_one_drop_duplicates_lhs(
+    employees: dy.LazyFrame[EmployeeSchema],
+    managers: dy.LazyFrame[ManagerSchema],
+) -> None:
+    actual = dy.require_relationship_one_to_one(
+        employees,
+        managers,
+        on="department_id",
+        drop_duplicates=True,
+    )
+    assert actual.select("department_id").collect().to_series().to_list() == [3]
 
 
 def test_one_to_at_least_one(
     departments: dy.LazyFrame[DepartmentSchema],
     employees: dy.LazyFrame[EmployeeSchema],
 ) -> None:
-    actual = dy.filter_relationship_one_to_at_least_one(
-        departments, employees, on="department_id"
+    actual = dy.require_relationship_one_to_at_least_one(
+        departments, employees, on="department_id", drop_duplicates=False
     )
-    assert actual.select("department_id").collect().to_series().to_list() == [2]
+    assert set(actual.select("department_id").collect().to_series().to_list()) == {2, 3}