Dataframely is a Python package to validate the schema and content of polars data frames. Its
purpose is to make data pipelines more robust by ensuring that data meets expectations and more readable by adding
schema information to data frame type hints.
You can install dataframely using your favorite package manager, e.g., pixi or pip:
pixi add dataframely
pip install dataframely

import dataframely as dy
import polars as pl
class HouseSchema(dy.Schema):
    """Schema for a data frame of house listings.

    Declares four non-nullable columns and two validation rules: one
    evaluated on the bathroom/bedroom ratio and one evaluated per
    ``zip_code`` group.
    """

    # Non-nullable string with a minimum length of 3 characters.
    zip_code = dy.String(nullable=False, min_length=3)
    # Non-nullable unsigned 8-bit integer counts.
    num_bedrooms = dy.UInt8(nullable=False)
    num_bathrooms = dy.UInt8(nullable=False)
    # Non-nullable 64-bit float.
    price = dy.Float64(nullable=False)

    @dy.rule()
    def reasonable_bathroom_to_bedroom_ratio() -> pl.Expr:
        """Require bathrooms/bedrooms to lie within [1/3, 3] (inclusive)."""
        ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
        return (ratio >= 1 / 3) & (ratio <= 3)

    @dy.rule(group_by=["zip_code"])
    def minimum_zip_code_count() -> pl.Expr:
        """Require at least two rows in each ``zip_code`` group."""
        return pl.len() >= 2


import polars as pl
# Example input: six house listings, one list entry per row in each column.
# Some rows contain a short zip code or a missing bedroom count.
df = pl.DataFrame(
    {
        "zip_code": ["01234", "01234", "1", "213", "123", "213"],
        "num_bedrooms": [2, 2, 1, None, None, 2],
        "num_bathrooms": [1, 2, 1, 1, 0, 8],
        "price": [100_000, 110_000, 50_000, 80_000, 60_000, 160_000],
    }
)
# Validate the data and cast columns to expected types
validated_df: dy.DataFrame[HouseSchema] = HouseSchema.validate(df, cast=True)

See more advanced usage examples in the documentation.