From 682e314c21fc94bba46fe88f8ab4081e5b1cc60b Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 14:18:28 +0200 Subject: [PATCH 01/21] feat: Add dbt integration for importing models as FeatureViews (#3335) This PR implements the dbt-Feast integration feature requested in #3335, enabling users to import dbt models as Feast FeatureViews. ## New CLI Commands - `feast dbt list` - List dbt models available for import - `feast dbt import` - Import dbt models as Feast objects ## Features - Parse dbt manifest.json files to extract model metadata - Map dbt types to Feast types (38 types supported) - Generate Entity, DataSource, and FeatureView objects - Support for BigQuery, Snowflake, and File data sources - Tag-based filtering (--tag) to select specific models - Code generation (--output) to create Python files - Dry-run mode to preview changes before applying ## Usage Examples ```bash # List models with 'feast' tag feast dbt list -m target/manifest.json --tag feast # Import models to registry feast dbt import -m target/manifest.json -e driver_id --tag feast # Generate Python file instead feast dbt import -m target/manifest.json -e driver_id --output features.py ``` Closes #3335 Signed-off-by: yassinnouh21 --- sdk/python/feast/cli/cli.py | 2 + sdk/python/feast/cli/dbt_import.py | 379 ++++++++++++++++ sdk/python/feast/dbt/__init__.py | 29 ++ sdk/python/feast/dbt/codegen.py | 378 ++++++++++++++++ sdk/python/feast/dbt/mapper.py | 411 ++++++++++++++++++ sdk/python/feast/dbt/parser.py | 227 ++++++++++ sdk/python/tests/unit/dbt/__init__.py | 1 + .../tests/unit/dbt/sample_manifest.json | 170 ++++++++ sdk/python/tests/unit/dbt/test_mapper.py | 309 +++++++++++++ sdk/python/tests/unit/dbt/test_parser.py | 293 +++++++++++++ 10 files changed, 2199 insertions(+) create mode 100644 sdk/python/feast/cli/dbt_import.py create mode 100644 sdk/python/feast/dbt/__init__.py create mode 100644 sdk/python/feast/dbt/codegen.py create mode 100644 sdk/python/feast/dbt/mapper.py create mode 100644 sdk/python/feast/dbt/parser.py create mode 100644 sdk/python/tests/unit/dbt/__init__.py create mode 100644 sdk/python/tests/unit/dbt/sample_manifest.json create mode 100644 sdk/python/tests/unit/dbt/test_mapper.py create mode 100644 sdk/python/tests/unit/dbt/test_parser.py diff --git a/sdk/python/feast/cli/cli.py b/sdk/python/feast/cli/cli.py index 60ea6292488..7812e065584 100644 --- a/sdk/python/feast/cli/cli.py +++ b/sdk/python/feast/cli/cli.py @@ -26,6 +26,7 @@ from feast import utils from feast.cli.data_sources import data_sources_cmd +from feast.cli.dbt_import import dbt_cmd from feast.cli.entities import entities_cmd from feast.cli.feature_services import feature_services_cmd from feast.cli.feature_views import feature_views_cmd @@ -553,6 +554,7 @@ def validate( cli.add_command(serve_offline_command) cli.add_command(serve_registry_command) cli.add_command(serve_transformations_command) +cli.add_command(dbt_cmd) if __name__ == "__main__": cli() diff --git a/sdk/python/feast/cli/dbt_import.py b/sdk/python/feast/cli/dbt_import.py new file mode 100644 index 00000000000..bd47c855a8b --- /dev/null +++ b/sdk/python/feast/cli/dbt_import.py @@ -0,0 +1,379 @@ +""" +CLI commands for importing dbt models as Feast features. + +This module provides the `feast dbt` command group for integrating +dbt models with Feast feature stores. 
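+
+Typical workflow: run `dbt compile` (or `dbt run`) to produce
+target/manifest.json, then use `feast dbt list` to preview models and
+`feast dbt import` to register them with Feast.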
+""" + +from typing import List, Optional + +import click +from colorama import Fore, Style + +from feast.repo_operations import cli_check_repo, create_feature_store + + +@click.group(name="dbt") +def dbt_cmd(): + """Import dbt models as Feast features.""" + pass + + +@dbt_cmd.command("import") +@click.option( + "--manifest-path", + "-m", + required=True, + type=click.Path(exists=True), + help="Path to dbt manifest.json file (typically target/manifest.json)", +) +@click.option( + "--entity-column", + "-e", + required=True, + help="Primary key / entity column name (e.g., driver_id, customer_id)", +) +@click.option( + "--data-source-type", + "-d", + type=click.Choice(["bigquery", "snowflake", "file"]), + default="bigquery", + show_default=True, + help="Type of data source to create", +) +@click.option( + "--timestamp-field", + "-t", + default="event_timestamp", + show_default=True, + help="Timestamp field name for point-in-time joins", +) +@click.option( + "--tag", + "tag_filter", + default=None, + help="Only import models with this dbt tag (e.g., --tag feast)", +) +@click.option( + "--model", + "model_names", + multiple=True, + help="Specific model names to import (can be specified multiple times)", +) +@click.option( + "--ttl-days", + type=int, + default=1, + show_default=True, + help="TTL (time-to-live) in days for feature views", +) +@click.option( + "--dry-run", + is_flag=True, + default=False, + help="Preview what would be created without applying changes", +) +@click.option( + "--exclude-columns", + default=None, + help="Comma-separated list of columns to exclude from features", +) +@click.option( + "--output", + "-o", + type=click.Path(), + default=None, + help="Output Python file path (e.g., features.py). Generates code instead of applying to registry.", +) +@click.pass_context +def import_command( + ctx: click.Context, + manifest_path: str, + entity_column: str, + data_source_type: str, + timestamp_field: str, + tag_filter: Optional[str], + model_names: tuple, + ttl_days: int, + dry_run: bool, + exclude_columns: Optional[str], + output: Optional[str], +): + """ + Import dbt models as Feast FeatureViews. + + This command parses a dbt manifest.json file and creates corresponding + Feast DataSource and FeatureView objects. 
+ + Examples: + + # Import all models with 'feast' tag + feast dbt import -m target/manifest.json -e driver_id --tag feast + + # Import specific models + feast dbt import -m target/manifest.json -e customer_id --model orders --model customers + + # Dry run to preview changes + feast dbt import -m target/manifest.json -e driver_id --tag feast --dry-run + + # Generate Python file instead of applying to registry + feast dbt import -m target/manifest.json -e driver_id --tag feast --output features.py + """ + from feast.dbt.parser import DbtManifestParser + from feast.dbt.mapper import DbtToFeastMapper + + # Parse manifest + click.echo(f"{Fore.CYAN}Parsing dbt manifest: {manifest_path}{Style.RESET_ALL}") + + try: + parser = DbtManifestParser(manifest_path) + parser.parse() + except FileNotFoundError as e: + click.echo(f"{Fore.RED}Error: {e}{Style.RESET_ALL}", err=True) + raise SystemExit(1) + except ValueError as e: + click.echo(f"{Fore.RED}Error: {e}{Style.RESET_ALL}", err=True) + raise SystemExit(1) + + # Display manifest info + if parser.dbt_version: + click.echo(f" dbt version: {parser.dbt_version}") + if parser.project_name: + click.echo(f" Project: {parser.project_name}") + + # Get models with filters + model_list: Optional[List[str]] = list(model_names) if model_names else None + models = parser.get_models(model_names=model_list, tag_filter=tag_filter) + + if not models: + click.echo( + f"{Fore.YELLOW}No models found matching the criteria.{Style.RESET_ALL}" + ) + if tag_filter: + click.echo(f" Tag filter: {tag_filter}") + if model_names: + click.echo(f" Model names: {', '.join(model_names)}") + raise SystemExit(0) + + click.echo( + f"{Fore.GREEN}Found {len(models)} model(s) to import:{Style.RESET_ALL}" + ) + for model in models: + tags_str = f" [tags: {', '.join(model.tags)}]" if model.tags else "" + click.echo(f" - {model.name} ({len(model.columns)} columns){tags_str}") + + # Parse exclude columns + excluded: Optional[List[str]] = None + if exclude_columns: + excluded = [c.strip() for c in exclude_columns.split(",")] + + # Create mapper + mapper = DbtToFeastMapper( + data_source_type=data_source_type, + timestamp_field=timestamp_field, + ttl_days=ttl_days, + ) + + # Generate Feast objects + click.echo(f"\n{Fore.CYAN}Generating Feast objects...{Style.RESET_ALL}") + + all_objects = [] + entities_created = {} + + for model in models: + # Validate timestamp field exists + column_names = [c.name for c in model.columns] + if timestamp_field not in column_names: + click.echo( + f"{Fore.YELLOW}Warning: Model '{model.name}' missing timestamp " + f"field '{timestamp_field}'. Skipping.{Style.RESET_ALL}" + ) + continue + + # Validate entity column exists + if entity_column not in column_names: + click.echo( + f"{Fore.YELLOW}Warning: Model '{model.name}' missing entity " + f"column '{entity_column}'. 
Skipping.{Style.RESET_ALL}"
+            )
+            continue
+
+        # Create or reuse entity
+        if entity_column not in entities_created:
+            entity = mapper.create_entity(
+                name=entity_column,
+                description="Entity key for dbt models",
+            )
+            entities_created[entity_column] = entity
+            all_objects.append(entity)
+        else:
+            entity = entities_created[entity_column]
+
+        # Create data source
+        data_source = mapper.create_data_source(
+            model=model,
+            timestamp_field=timestamp_field,
+        )
+        all_objects.append(data_source)
+
+        # Create feature view
+        feature_view = mapper.create_feature_view(
+            model=model,
+            source=data_source,
+            entity_column=entity_column,
+            entity=entity,
+            timestamp_field=timestamp_field,
+            ttl_days=ttl_days,
+            exclude_columns=excluded,
+        )
+        all_objects.append(feature_view)
+
+        click.echo(
+            f"  {Fore.GREEN}✓{Style.RESET_ALL} {model.name}: "
+            f"DataSource + FeatureView ({len(feature_view.features)} features)"
+        )
+
+    if not all_objects:
+        click.echo(
+            f"{Fore.YELLOW}No valid models to import (check warnings above).{Style.RESET_ALL}"
+        )
+        raise SystemExit(0)
+
+    # Filter models that were actually processed (have valid columns)
+    valid_models = [
+        m for m in models
+        if timestamp_field in [c.name for c in m.columns]
+        and entity_column in [c.name for c in m.columns]
+    ]
+
+    # Summary
+    click.echo(f"\n{Fore.CYAN}Summary:{Style.RESET_ALL}")
+    click.echo(f"  Entities: {len(entities_created)}")
+    click.echo(f"  DataSources: {len(valid_models)}")
+    click.echo(f"  FeatureViews: {len(valid_models)}")
+
+    # Generate Python file if --output specified
+    if output:
+        from feast.dbt.codegen import generate_feast_code
+
+        code = generate_feast_code(
+            models=valid_models,
+            entity_column=entity_column,
+            data_source_type=data_source_type,
+            timestamp_field=timestamp_field,
+            ttl_days=ttl_days,
+            manifest_path=manifest_path,
+            project_name=parser.project_name or "",
+            exclude_columns=excluded,
+            online=True,
+        )
+
+        with open(output, "w") as f:
+            f.write(code)
+
+        click.echo(
+            f"\n{Fore.GREEN}✓ Generated Feast definitions: {output}{Style.RESET_ALL}"
+        )
+        click.echo("  You can now include this file in your Feast feature repository.")
+        return
+
+    if dry_run:
+        click.echo(
+            f"\n{Fore.YELLOW}Dry run - no changes applied.{Style.RESET_ALL}"
+        )
+        click.echo("Remove the --dry-run flag to apply changes.")
+        return
+
+    # Apply to Feast
+    click.echo(f"\n{Fore.CYAN}Applying to Feast registry...{Style.RESET_ALL}")
+
+    repo = ctx.obj["CHDIR"]
+    fs_yaml_file = ctx.obj["FS_YAML_FILE"]
+    cli_check_repo(repo, fs_yaml_file)
+    store = create_feature_store(ctx)
+
+    store.apply(all_objects)
+
+    click.echo(
+        f"{Fore.GREEN}✓ Successfully imported {len(valid_models)} dbt model(s) "
+        f"to Feast project '{store.project}'{Style.RESET_ALL}"
+    )
+
+
+@dbt_cmd.command("list")
+@click.option(
+    "--manifest-path",
+    "-m",
+    required=True,
+    type=click.Path(exists=True),
+    help="Path to dbt manifest.json file",
+)
+@click.option(
+    "--tag",
+    "tag_filter",
+    default=None,
+    help="Filter models by dbt tag",
+)
+@click.option(
+    "--show-columns",
+    is_flag=True,
+    default=False,
+    help="Show column details for each model",
+)
+def list_command(
+    manifest_path: str,
+    tag_filter: Optional[str],
+    show_columns: bool,
+):
+    """
+    List dbt models available for import.
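+
+    Useful for previewing which models (and, with --show-columns, which
+    columns) a subsequent `feast dbt import` would pick up.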
+ + Examples: + + # List all models + feast dbt list -m target/manifest.json + + # List models with specific tag + feast dbt list -m target/manifest.json --tag feast + + # Show column details + feast dbt list -m target/manifest.json --show-columns + """ + from feast.dbt.parser import DbtManifestParser + + click.echo(f"{Fore.CYAN}Parsing dbt manifest: {manifest_path}{Style.RESET_ALL}") + + try: + parser = DbtManifestParser(manifest_path) + parser.parse() + except (FileNotFoundError, ValueError) as e: + click.echo(f"{Fore.RED}Error: {e}{Style.RESET_ALL}", err=True) + raise SystemExit(1) + + if parser.dbt_version: + click.echo(f" dbt version: {parser.dbt_version}") + if parser.project_name: + click.echo(f" Project: {parser.project_name}") + + models = parser.get_models(tag_filter=tag_filter) + + if not models: + click.echo(f"{Fore.YELLOW}No models found.{Style.RESET_ALL}") + return + + click.echo(f"\n{Fore.GREEN}Found {len(models)} model(s):{Style.RESET_ALL}\n") + + for model in models: + tags_str = f" [tags: {', '.join(model.tags)}]" if model.tags else "" + click.echo(f"{Fore.CYAN}{model.name}{Style.RESET_ALL}{tags_str}") + click.echo(f" Table: {model.full_table_name}") + if model.description: + click.echo(f" Description: {model.description[:80]}...") + + if show_columns and model.columns: + click.echo(f" Columns ({len(model.columns)}):") + for col in model.columns: + type_str = col.data_type or "unknown" + click.echo(f" - {col.name}: {type_str}") + + click.echo() diff --git a/sdk/python/feast/dbt/__init__.py b/sdk/python/feast/dbt/__init__.py new file mode 100644 index 00000000000..0c6a290f3cf --- /dev/null +++ b/sdk/python/feast/dbt/__init__.py @@ -0,0 +1,29 @@ +""" +dbt integration for Feast. + +This module provides functionality to import dbt models as Feast FeatureViews, +enabling automatic generation of Feast objects from dbt manifest.json files. + +Example usage: + >>> from feast.dbt import DbtManifestParser, DbtToFeastMapper + >>> parser = DbtManifestParser("target/manifest.json") + >>> parser.parse() + >>> models = parser.get_models(tag_filter="feast") + >>> mapper = DbtToFeastMapper(data_source_type="bigquery") + >>> for model in models: + ... data_source = mapper.create_data_source(model) + ... feature_view = mapper.create_feature_view(model, data_source, "driver_id") +""" + +from feast.dbt.parser import DbtManifestParser, DbtModel, DbtColumn +from feast.dbt.mapper import DbtToFeastMapper +from feast.dbt.codegen import DbtCodeGenerator, generate_feast_code + +__all__ = [ + "DbtManifestParser", + "DbtModel", + "DbtColumn", + "DbtToFeastMapper", + "DbtCodeGenerator", + "generate_feast_code", +] diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py new file mode 100644 index 00000000000..ec3cbea3cbc --- /dev/null +++ b/sdk/python/feast/dbt/codegen.py @@ -0,0 +1,378 @@ +""" +Code generator for dbt to Feast imports. + +This module generates Python code files containing Feast object definitions +(Entity, DataSource, FeatureView) from dbt model metadata. +""" + +from typing import Any, List, Optional, Set + +from jinja2 import Environment, BaseLoader + +from feast.dbt.parser import DbtModel +from feast.dbt.mapper import map_dbt_type_to_feast_type +from feast.types import ( + Array, + Bool, + Bytes, + Float32, + Float64, + Int32, + Int64, + String, + UnixTimestamp, +) + + +# Template for generating a complete Feast definitions file +FEAST_FILE_TEMPLATE = '''""" +Feast feature definitions generated from dbt models. 
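+This file is overwritten each time `feast dbt import --output` is re-run.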
+ +Source: {{ manifest_path }} +Project: {{ project_name }} +Generated by: feast dbt import +""" + +from datetime import timedelta + +from feast import Entity, FeatureView, Field +{% if type_imports %} +from feast.types import {{ type_imports | join(', ') }} +{% endif %} +{% if data_source_type == 'bigquery' %} +from feast.infra.offline_stores.bigquery_source import BigQuerySource +{% elif data_source_type == 'snowflake' %} +from feast.infra.offline_stores.snowflake_source import SnowflakeSource +{% elif data_source_type == 'file' %} +from feast.infra.offline_stores.file_source import FileSource +{% endif %} + + +# ============================================================================= +# Entities +# ============================================================================= + +{% for entity in entities %} +{{ entity.var_name }} = Entity( + name="{{ entity.name }}", + join_keys=["{{ entity.join_key }}"], + description="{{ entity.description }}", + tags={{ entity.tags }}, +) + +{% endfor %} + +# ============================================================================= +# Data Sources +# ============================================================================= + +{% for source in data_sources %} +{% if data_source_type == 'bigquery' %} +{{ source.var_name }} = BigQuerySource( + name="{{ source.name }}", + table="{{ source.table }}", + timestamp_field="{{ source.timestamp_field }}", + description="{{ source.description }}", + tags={{ source.tags }}, +) +{% elif data_source_type == 'snowflake' %} +{{ source.var_name }} = SnowflakeSource( + name="{{ source.name }}", + database="{{ source.database }}", + schema="{{ source.schema }}", + table="{{ source.table }}", + timestamp_field="{{ source.timestamp_field }}", + description="{{ source.description }}", + tags={{ source.tags }}, +) +{% elif data_source_type == 'file' %} +{{ source.var_name }} = FileSource( + name="{{ source.name }}", + path="{{ source.path }}", + timestamp_field="{{ source.timestamp_field }}", + description="{{ source.description }}", + tags={{ source.tags }}, +) +{% endif %} + +{% endfor %} + +# ============================================================================= +# Feature Views +# ============================================================================= + +{% for fv in feature_views %} +{{ fv.var_name }} = FeatureView( + name="{{ fv.name }}", + entities=[{{ fv.entity_var }}], + ttl=timedelta(days={{ fv.ttl_days }}), + schema=[ +{% for field in fv.fields %} + Field(name="{{ field.name }}", dtype={{ field.dtype }}{% if field.description %}, description="{{ field.description }}"{% endif %}), +{% endfor %} + ], + online={{ fv.online }}, + source={{ fv.source_var }}, + description="{{ fv.description }}", + tags={{ fv.tags }}, +) + +{% endfor %} +''' + + +def _get_feast_type_name(feast_type: Any) -> str: + """Get the string name of a Feast type for code generation.""" + if isinstance(feast_type, Array): + # Handle Array types + base_type_name = _get_feast_type_name(feast_type.base_type) + return f"Array({base_type_name})" + + # Map type objects to their names + type_map = { + String: "String", + Int32: "Int32", + Int64: "Int64", + Float32: "Float32", + Float64: "Float64", + Bool: "Bool", + UnixTimestamp: "UnixTimestamp", + Bytes: "Bytes", + } + + return type_map.get(feast_type, "String") + + +def _make_var_name(name: str) -> str: + """Convert a name to a valid Python variable name.""" + # Replace hyphens and spaces with underscores + var_name = name.replace("-", "_").replace(" ", "_") + # Ensure it 
starts with a letter or underscore + if var_name and var_name[0].isdigit(): + var_name = f"_{var_name}" + return var_name + + +def _escape_description(desc: Optional[str]) -> str: + """Escape a description string for use in Python code.""" + if not desc: + return "" + # Escape quotes and newlines + return desc.replace("\\", "\\\\").replace('"', '\\"').replace("\n", " ") + + +class DbtCodeGenerator: + """ + Generates Python code for Feast objects from dbt models. + + This class creates complete, importable Python files containing + Entity, DataSource, and FeatureView definitions. + + Example: + >>> generator = DbtCodeGenerator( + ... data_source_type="bigquery", + ... timestamp_field="event_timestamp", + ... ttl_days=7 + ... ) + >>> code = generator.generate( + ... models=models, + ... entity_column="user_id", + ... manifest_path="target/manifest.json", + ... project_name="my_project" + ... ) + >>> with open("features.py", "w") as f: + ... f.write(code) + """ + + def __init__( + self, + data_source_type: str = "bigquery", + timestamp_field: str = "event_timestamp", + ttl_days: int = 1, + ): + self.data_source_type = data_source_type.lower() + self.timestamp_field = timestamp_field + self.ttl_days = ttl_days + + # Set up Jinja2 environment + self.env = Environment( + loader=BaseLoader(), + trim_blocks=True, + lstrip_blocks=True, + ) + self.template = self.env.from_string(FEAST_FILE_TEMPLATE) + + def generate( + self, + models: List[DbtModel], + entity_column: str, + manifest_path: str = "", + project_name: str = "", + exclude_columns: Optional[List[str]] = None, + online: bool = True, + ) -> str: + """ + Generate Python code for Feast objects from dbt models. + + Args: + models: List of DbtModel objects to generate code for + entity_column: The entity/primary key column name + manifest_path: Path to the dbt manifest (for documentation) + project_name: dbt project name (for documentation) + exclude_columns: Columns to exclude from features + online: Whether to enable online serving + + Returns: + Generated Python code as a string + """ + excluded = {entity_column, self.timestamp_field} + if exclude_columns: + excluded.update(exclude_columns) + + # Collect all Feast types used for imports + type_imports: Set[str] = set() + + # Prepare entity data + entities = [] + entity_var = _make_var_name(entity_column) + entities.append({ + "var_name": entity_var, + "name": entity_column, + "join_key": entity_column, + "description": f"Entity key for dbt models", + "tags": {"source": "dbt"}, + }) + + # Prepare data sources and feature views + data_sources = [] + feature_views = [] + + for model in models: + # Check required columns exist + column_names = [c.name for c in model.columns] + if self.timestamp_field not in column_names: + continue + if entity_column not in column_names: + continue + + # Build tags + tags = {"dbt.model": model.name} + for tag in model.tags: + tags[f"dbt.tag.{tag}"] = "true" + + # Data source + source_var = _make_var_name(f"{model.name}_source") + source_data = { + "var_name": source_var, + "name": f"{model.name}_source", + "timestamp_field": self.timestamp_field, + "description": _escape_description(model.description), + "tags": tags, + } + + if self.data_source_type == "bigquery": + source_data["table"] = model.full_table_name + elif self.data_source_type == "snowflake": + source_data["database"] = model.database + source_data["schema"] = model.schema + source_data["table"] = model.alias + elif self.data_source_type == "file": + source_data["path"] = 
f"/data/{model.name}.parquet" + + data_sources.append(source_data) + + # Feature view fields + fields = [] + for column in model.columns: + if column.name in excluded: + continue + + feast_type = map_dbt_type_to_feast_type(column.data_type) + type_name = _get_feast_type_name(feast_type) + + # Track base type for imports (handle Array specially) + if isinstance(feast_type, Array): + type_imports.add("Array") + # Also add the base type + base_type_name = _get_feast_type_name(feast_type.base_type) + type_imports.add(base_type_name) + else: + type_imports.add(type_name) + + fields.append({ + "name": column.name, + "dtype": type_name, + "description": _escape_description(column.description), + }) + + # Feature view + fv_var = _make_var_name(f"{model.name}_fv") + feature_views.append({ + "var_name": fv_var, + "name": model.name, + "entity_var": entity_var, + "source_var": source_var, + "ttl_days": self.ttl_days, + "fields": fields, + "online": online, + "description": _escape_description(model.description), + "tags": tags, + }) + + # Sort type imports for consistent output + sorted_types = sorted(type_imports) + + # Render template + return self.template.render( + manifest_path=manifest_path, + project_name=project_name, + data_source_type=self.data_source_type, + type_imports=sorted_types, + entities=entities, + data_sources=data_sources, + feature_views=feature_views, + ) + + +def generate_feast_code( + models: List[DbtModel], + entity_column: str, + data_source_type: str = "bigquery", + timestamp_field: str = "event_timestamp", + ttl_days: int = 1, + manifest_path: str = "", + project_name: str = "", + exclude_columns: Optional[List[str]] = None, + online: bool = True, +) -> str: + """ + Convenience function to generate Feast code from dbt models. + + Args: + models: List of DbtModel objects + entity_column: Primary key column name + data_source_type: Type of data source (bigquery, snowflake, file) + timestamp_field: Timestamp column name + ttl_days: TTL in days for feature views + manifest_path: Path to manifest for documentation + project_name: Project name for documentation + exclude_columns: Columns to exclude from features + online: Whether to enable online serving + + Returns: + Generated Python code as a string + """ + generator = DbtCodeGenerator( + data_source_type=data_source_type, + timestamp_field=timestamp_field, + ttl_days=ttl_days, + ) + + return generator.generate( + models=models, + entity_column=entity_column, + manifest_path=manifest_path, + project_name=project_name, + exclude_columns=exclude_columns, + online=online, + ) diff --git a/sdk/python/feast/dbt/mapper.py b/sdk/python/feast/dbt/mapper.py new file mode 100644 index 00000000000..1275b7cf0a1 --- /dev/null +++ b/sdk/python/feast/dbt/mapper.py @@ -0,0 +1,411 @@ +""" +dbt to Feast type and object mapper. + +This module provides functionality to map dbt model metadata to Feast objects +including DataSource, Entity, and FeatureView. 
+""" + +from datetime import timedelta +from typing import Any, Dict, List, Optional, Union + +from feast.dbt.parser import DbtModel +from feast.entity import Entity +from feast.feature_view import FeatureView +from feast.field import Field +from feast.value_type import ValueType +from feast.types import ( + Array, + Bool, + Bytes, + Float32, + Float64, + Int32, + Int64, + String, + UnixTimestamp, + FeastType, +) + + +# Comprehensive mapping from dbt/warehouse types to Feast types +# Covers BigQuery, Snowflake, Redshift, PostgreSQL, and common SQL types +DBT_TO_FEAST_TYPE_MAP: Dict[str, FeastType] = { + # String types + "STRING": String, + "TEXT": String, + "VARCHAR": String, + "CHAR": String, + "CHARACTER": String, + "NVARCHAR": String, + "NCHAR": String, + "CHARACTER VARYING": String, + # Integer types + "INT": Int64, + "INT32": Int32, + "INT64": Int64, + "INTEGER": Int64, + "BIGINT": Int64, + "SMALLINT": Int32, + "TINYINT": Int32, + "BYTEINT": Int32, + "NUMBER": Int64, # Snowflake - default to Int64, precision handling below + "NUMERIC": Int64, + "DECIMAL": Int64, + # Float types + "FLOAT": Float32, + "FLOAT32": Float32, + "FLOAT64": Float64, + "DOUBLE": Float64, + "DOUBLE PRECISION": Float64, + "REAL": Float32, + # Boolean types + "BOOL": Bool, + "BOOLEAN": Bool, + # Timestamp types + "TIMESTAMP": UnixTimestamp, + "TIMESTAMP_NTZ": UnixTimestamp, + "TIMESTAMP_LTZ": UnixTimestamp, + "TIMESTAMP_TZ": UnixTimestamp, + "DATETIME": UnixTimestamp, + "DATE": UnixTimestamp, + "TIME": UnixTimestamp, + # Binary types + "BYTES": Bytes, + "BINARY": Bytes, + "VARBINARY": Bytes, + "BLOB": Bytes, +} + + +def map_dbt_type_to_feast_type(dbt_type: str) -> FeastType: + """ + Map a dbt data type to a Feast type. + + Handles various database type formats including: + - Simple types: STRING, INT64, FLOAT + - Parameterized types: VARCHAR(255), NUMBER(10,2), DECIMAL(18,0) + - Array types: ARRAY, ARRAY + + Args: + dbt_type: The dbt/database data type string + + Returns: + The corresponding Feast type + + Examples: + >>> map_dbt_type_to_feast_type("STRING") + String + >>> map_dbt_type_to_feast_type("INT64") + Int64 + >>> map_dbt_type_to_feast_type("ARRAY") + Array(String) + """ + if not dbt_type: + return String + + # Normalize the type string + normalized = dbt_type.upper().strip() + + # Handle ARRAY types: ARRAY + if normalized.startswith("ARRAY<") and normalized.endswith(">"): + element_type_str = normalized[6:-1].strip() + element_type = map_dbt_type_to_feast_type(element_type_str) + # Array only supports primitive types + if isinstance(element_type, type(String)): + return Array(element_type) + return Array(String) # Fallback for complex nested types + + # Handle parameterized types: VARCHAR(255), NUMBER(10,2), etc. 
+ # Extract base type by removing parentheses and parameters + base_type = normalized.split("(")[0].strip() + + # Handle Snowflake NUMBER with precision + if base_type == "NUMBER" and "(" in normalized: + try: + # Parse precision and scale: NUMBER(precision, scale) + params = normalized.split("(")[1].rstrip(")").split(",") + precision = int(params[0].strip()) + scale = int(params[1].strip()) if len(params) > 1 else 0 + + if scale > 0: + # Has decimal places, use Float64 + return Float64 + elif precision <= 9: + return Int32 + elif precision <= 18: + return Int64 + else: + # Precision > 18, may exceed Int64 range + return Float64 + except (ValueError, IndexError): + return Int64 + + # Look up in mapping table + if base_type in DBT_TO_FEAST_TYPE_MAP: + return DBT_TO_FEAST_TYPE_MAP[base_type] + + # Default to String for unknown types + return String + + +class DbtToFeastMapper: + """ + Maps dbt models to Feast objects. + + Supports creating DataSource, Entity, and FeatureView objects from + dbt model metadata. + + Examples: + >>> mapper = DbtToFeastMapper(data_source_type="bigquery") + >>> data_source = mapper.create_data_source(model) + >>> feature_view = mapper.create_feature_view( + ... model, data_source, entity_column="driver_id" + ... ) + + Args: + data_source_type: Type of data source ('bigquery', 'snowflake', 'file') + timestamp_field: Default timestamp field name + ttl_days: Default TTL in days for feature views + """ + + def __init__( + self, + data_source_type: str = "bigquery", + timestamp_field: str = "event_timestamp", + ttl_days: int = 1, + ): + self.data_source_type = data_source_type.lower() + self.timestamp_field = timestamp_field + self.ttl_days = ttl_days + + def create_data_source( + self, + model: DbtModel, + timestamp_field: Optional[str] = None, + created_timestamp_column: Optional[str] = None, + ) -> Any: + """ + Create a Feast DataSource from a dbt model. 
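+
+        The table reference comes from the dbt manifest: BigQuery sources
+        use the fully qualified database.schema.alias name, Snowflake
+        sources use the separate database/schema/alias fields, and file
+        sources use a placeholder parquet path.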
+ + Args: + model: The DbtModel to create a DataSource from + timestamp_field: Override the default timestamp field + created_timestamp_column: Column for created timestamp (dedup) + + Returns: + A Feast DataSource (BigQuerySource, SnowflakeSource, or FileSource) + + Raises: + ValueError: If data_source_type is not supported + """ + ts_field = timestamp_field or self.timestamp_field + + # Build tags from dbt metadata + tags = {"dbt.model": model.name} + for tag in model.tags: + tags[f"dbt.tag.{tag}"] = "true" + + if self.data_source_type == "bigquery": + from feast.infra.offline_stores.bigquery_source import BigQuerySource + + return BigQuerySource( + name=f"{model.name}_source", + table=model.full_table_name, + timestamp_field=ts_field, + created_timestamp_column=created_timestamp_column or "", + description=model.description, + tags=tags, + ) + + elif self.data_source_type == "snowflake": + from feast.infra.offline_stores.snowflake_source import SnowflakeSource + + return SnowflakeSource( + name=f"{model.name}_source", + database=model.database, + schema=model.schema, + table=model.alias, + timestamp_field=ts_field, + created_timestamp_column=created_timestamp_column or "", + description=model.description, + tags=tags, + ) + + elif self.data_source_type == "file": + from feast.infra.offline_stores.file_source import FileSource + + # For file sources, use the model name as a placeholder path + return FileSource( + name=f"{model.name}_source", + path=f"/data/{model.name}.parquet", + timestamp_field=ts_field, + created_timestamp_column=created_timestamp_column or "", + description=model.description, + tags=tags, + ) + + else: + raise ValueError( + f"Unsupported data_source_type: {self.data_source_type}. " + f"Supported types: bigquery, snowflake, file" + ) + + def create_entity( + self, + name: str, + join_keys: Optional[List[str]] = None, + description: str = "", + tags: Optional[Dict[str, str]] = None, + value_type: ValueType = ValueType.STRING, + ) -> Entity: + """ + Create a Feast Entity. + + Args: + name: Entity name + join_keys: List of join key column names (defaults to [name]) + description: Entity description + tags: Optional tags + value_type: Value type for the entity (default: STRING) + + Returns: + A Feast Entity + """ + return Entity( + name=name, + join_keys=join_keys or [name], + value_type=value_type, + description=description, + tags=tags or {}, + ) + + def create_feature_view( + self, + model: DbtModel, + source: Any, + entity_column: str, + entity: Optional[Entity] = None, + timestamp_field: Optional[str] = None, + ttl_days: Optional[int] = None, + exclude_columns: Optional[List[str]] = None, + online: bool = True, + ) -> FeatureView: + """ + Create a Feast FeatureView from a dbt model. 
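+
+        The entity column, timestamp field, and any exclude_columns are
+        omitted from the generated feature schema.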
+ + Args: + model: The DbtModel to create a FeatureView from + source: The DataSource for this FeatureView + entity_column: The entity/primary key column name + entity: Optional pre-created Entity (created if not provided) + timestamp_field: Override the default timestamp field + ttl_days: Override the default TTL in days + exclude_columns: Additional columns to exclude from features + online: Whether to enable online serving + + Returns: + A Feast FeatureView + """ + ts_field = timestamp_field or self.timestamp_field + ttl = timedelta(days=ttl_days if ttl_days is not None else self.ttl_days) + + # Columns to exclude from features + excluded = {entity_column, ts_field} + if exclude_columns: + excluded.update(exclude_columns) + + # Create schema from model columns + schema: List[Field] = [] + for column in model.columns: + if column.name not in excluded: + feast_type = map_dbt_type_to_feast_type(column.data_type) + schema.append( + Field( + name=column.name, + dtype=feast_type, + description=column.description, + ) + ) + + # Create entity if not provided + if entity is None: + entity = self.create_entity( + name=entity_column, + description=f"Entity for {model.name}", + ) + + # Build tags from dbt metadata + tags = { + "dbt.model": model.name, + "dbt.unique_id": model.unique_id, + } + for tag in model.tags: + tags[f"dbt.tag.{tag}"] = "true" + + return FeatureView( + name=model.name, + source=source, + schema=schema, + entities=[entity], + ttl=ttl, + online=online, + description=model.description, + tags=tags, + ) + + def create_all_from_model( + self, + model: DbtModel, + entity_column: str, + timestamp_field: Optional[str] = None, + ttl_days: Optional[int] = None, + exclude_columns: Optional[List[str]] = None, + online: bool = True, + ) -> Dict[str, Union[Entity, Any, FeatureView]]: + """ + Create all Feast objects (DataSource, Entity, FeatureView) from a dbt model. + + This is a convenience method that creates all necessary Feast objects + in one call. + + Args: + model: The DbtModel to create objects from + entity_column: The entity/primary key column name + timestamp_field: Override the default timestamp field + ttl_days: Override the default TTL in days + exclude_columns: Additional columns to exclude from features + online: Whether to enable online serving + + Returns: + Dict with keys 'entity', 'data_source', 'feature_view' + """ + # Create entity + entity = self.create_entity( + name=entity_column, + description=f"Entity for {model.name}", + tags={"dbt.model": model.name}, + ) + + # Create data source + data_source = self.create_data_source( + model=model, + timestamp_field=timestamp_field, + ) + + # Create feature view + feature_view = self.create_feature_view( + model=model, + source=data_source, + entity_column=entity_column, + entity=entity, + timestamp_field=timestamp_field, + ttl_days=ttl_days, + exclude_columns=exclude_columns, + online=online, + ) + + return { + "entity": entity, + "data_source": data_source, + "feature_view": feature_view, + } diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py new file mode 100644 index 00000000000..ef3e7c8b80b --- /dev/null +++ b/sdk/python/feast/dbt/parser.py @@ -0,0 +1,227 @@ +""" +dbt manifest parser for Feast integration. + +This module provides functionality to parse dbt manifest.json files and extract +model metadata for generating Feast FeatureViews. + +Uses dbt-artifacts-parser to handle manifest versions v1-v12 (dbt 0.19 through 1.11). 
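+If dbt-artifacts-parser is not installed, the parser falls back to reading
+the raw manifest dictionary directly.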
+""" + +import json +from enum import property +from pathlib import Path +from typing import Any, Dict, List, Optional +from dataclasses import dataclass, field + + +@dataclass +class DbtColumn: + """Represents a column in a dbt model.""" + + name: str + description: str = "" + data_type: str = "STRING" + tags: List[str] = field(default_factory=list) + meta: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class DbtModel: + """Represents a dbt model.""" + + name: str + unique_id: str + database: str + schema: str + alias: str + description: str = "" + columns: List[DbtColumn] = field(default_factory=list) + tags: List[str] = field(default_factory=list) + meta: Dict[str, Any] = field(default_factory=dict) + depends_on: List[str] = field(default_factory=list) + + @property + def full_table_name(self) -> str: + """Returns fully qualified table name (database.schema.table).""" + return f"{self.database}.{self.schema}.{self.alias}" + + +class DbtManifestParser: + """ + Parser for dbt manifest.json files. + + Uses dbt-artifacts-parser to handle manifest versions v1-v12. + Supports dbt versions 0.19 through 1.11. + + Examples: + >>> parser = DbtManifestParser("target/manifest.json") + >>> parser.parse() + >>> models = parser.get_models(tag_filter="feast") + >>> for model in models: + ... print(f"Model: {model.name}, Columns: {len(model.columns)}") + + Args: + manifest_path: Path to manifest.json file (typically target/manifest.json) + + Raises: + FileNotFoundError: If manifest.json doesn't exist + ValueError: If manifest.json is invalid JSON + """ + + def __init__(self, manifest_path: str): + """ + Initialize parser. + + Args: + manifest_path: Path to manifest.json file + """ + self.manifest_path = Path(manifest_path) + self.manifest = None + self._raw_manifest: Optional[Dict[str, Any]] = None + + def parse(self) -> None: + """ + Load and parse the manifest.json file. + + Raises: + FileNotFoundError: If manifest.json doesn't exist + ValueError: If manifest.json is invalid JSON + """ + if not self.manifest_path.exists(): + raise FileNotFoundError( + f"dbt manifest not found at {self.manifest_path}.\n" + f"Run 'dbt compile' or 'dbt run' first.\n" + f"Expected path: /target/manifest.json" + ) + + try: + with open(self.manifest_path, "r") as f: + self._raw_manifest = json.load(f) + except json.JSONDecodeError as e: + raise ValueError( + f"Invalid JSON in manifest: {e}\n" + f"Try: dbt clean && dbt compile" + ) + + # Try to use dbt-artifacts-parser if available + try: + from dbt_artifacts_parser.parser import parse_manifest + + self.manifest = parse_manifest(manifest=self._raw_manifest) + except ImportError: + # Fall back to raw dict parsing if dbt-artifacts-parser not installed + self.manifest = None + + def get_models( + self, + model_names: Optional[List[str]] = None, + tag_filter: Optional[str] = None, + ) -> List[DbtModel]: + """ + Extract dbt models from manifest. + + Args: + model_names: Optional list of specific model names to extract + tag_filter: Optional tag to filter models by + + Returns: + List of DbtModel objects + + Examples: + >>> models = parser.get_models(model_names=["driver_stats"]) + >>> models = parser.get_models(tag_filter="feast") + """ + if self._raw_manifest is None: + self.parse() + + models = [] + nodes = self._raw_manifest.get("nodes", {}) + + for node_id, node in nodes.items(): + # Only process models (not tests, seeds, snapshots, etc.) 
+ if not node_id.startswith("model."): + continue + + # Also check resource_type if available + resource_type = node.get("resource_type", "model") + if resource_type != "model": + continue + + model_name = node.get("name", "") + + # Filter by model names if specified + if model_names and model_name not in model_names: + continue + + # Get tags from node + node_tags = node.get("tags", []) or [] + + # Filter by tag if specified + if tag_filter and tag_filter not in node_tags: + continue + + # Extract columns + columns = [] + node_columns = node.get("columns", {}) or {} + for col_name, col_data in node_columns.items(): + if isinstance(col_data, dict): + columns.append( + DbtColumn( + name=col_name, + description=col_data.get("description", "") or "", + data_type=col_data.get("data_type", "STRING") or "STRING", + tags=col_data.get("tags", []) or [], + meta=col_data.get("meta", {}) or {}, + ) + ) + + # Get depends_on nodes + depends_on = node.get("depends_on", {}) or {} + depends_on_nodes = depends_on.get("nodes", []) or [] + + # Create DbtModel + models.append( + DbtModel( + name=model_name, + unique_id=node_id, + database=node.get("database", "") or "", + schema=node.get("schema", "") or "", + alias=node.get("alias", model_name) or model_name, + description=node.get("description", "") or "", + columns=columns, + tags=node_tags, + meta=node.get("meta", {}) or {}, + depends_on=depends_on_nodes, + ) + ) + + return models + + def get_model_by_name(self, model_name: str) -> Optional[DbtModel]: + """ + Get a specific model by name. + + Args: + model_name: Name of the model to retrieve + + Returns: + DbtModel if found, None otherwise + """ + models = self.get_models(model_names=[model_name]) + return models[0] if models else None + + @property + def dbt_version(self) -> Optional[str]: + """Get dbt version from manifest metadata.""" + if self._raw_manifest is None: + return None + metadata = self._raw_manifest.get("metadata", {}) + return metadata.get("dbt_version") + + @property + def project_name(self) -> Optional[str]: + """Get project name from manifest metadata.""" + if self._raw_manifest is None: + return None + metadata = self._raw_manifest.get("metadata", {}) + return metadata.get("project_name") diff --git a/sdk/python/tests/unit/dbt/__init__.py b/sdk/python/tests/unit/dbt/__init__.py new file mode 100644 index 00000000000..8a225265d2c --- /dev/null +++ b/sdk/python/tests/unit/dbt/__init__.py @@ -0,0 +1 @@ +# dbt integration tests diff --git a/sdk/python/tests/unit/dbt/sample_manifest.json b/sdk/python/tests/unit/dbt/sample_manifest.json new file mode 100644 index 00000000000..6a44b9db749 --- /dev/null +++ b/sdk/python/tests/unit/dbt/sample_manifest.json @@ -0,0 +1,170 @@ +{ + "metadata": { + "dbt_version": "1.5.0", + "project_name": "sample_dbt_project", + "generated_at": "2024-01-10T00:00:00Z", + "invocation_id": "12345678-1234-1234-1234-123456789012" + }, + "nodes": { + "model.sample_dbt_project.driver_stats": { + "name": "driver_stats", + "unique_id": "model.sample_dbt_project.driver_stats", + "resource_type": "model", + "database": "feast_demo", + "schema": "public", + "alias": "driver_stats", + "description": "Driver statistics aggregated hourly for ML features", + "columns": { + "driver_id": { + "name": "driver_id", + "description": "Unique driver identifier", + "data_type": "INT64", + "tags": ["entity", "primary_key"], + "meta": {} + }, + "event_timestamp": { + "name": "event_timestamp", + "description": "Timestamp of the event", + "data_type": "TIMESTAMP", + "tags": ["timestamp"], + 
"meta": {} + }, + "trip_count": { + "name": "trip_count", + "description": "Total number of trips completed", + "data_type": "INT64", + "tags": ["feature"], + "meta": {} + }, + "avg_rating": { + "name": "avg_rating", + "description": "Average driver rating (1-5 scale)", + "data_type": "FLOAT64", + "tags": ["feature"], + "meta": {} + }, + "total_earnings": { + "name": "total_earnings", + "description": "Total earnings in dollars", + "data_type": "FLOAT64", + "tags": ["feature"], + "meta": {} + }, + "is_active": { + "name": "is_active", + "description": "Whether driver is currently active", + "data_type": "BOOLEAN", + "tags": ["feature"], + "meta": {} + } + }, + "tags": ["feast", "ml", "driver"], + "meta": { + "owner": "ml-team@example.com", + "team": "driver-experience" + }, + "depends_on": { + "nodes": ["source.sample_dbt_project.raw_trips"] + } + }, + "model.sample_dbt_project.customer_stats": { + "name": "customer_stats", + "unique_id": "model.sample_dbt_project.customer_stats", + "resource_type": "model", + "database": "feast_demo", + "schema": "public", + "alias": "customer_stats", + "description": "Customer statistics for personalization features", + "columns": { + "customer_id": { + "name": "customer_id", + "description": "Unique customer identifier", + "data_type": "STRING", + "tags": ["entity"], + "meta": {} + }, + "event_timestamp": { + "name": "event_timestamp", + "description": "Event timestamp", + "data_type": "TIMESTAMP", + "tags": [], + "meta": {} + }, + "order_count": { + "name": "order_count", + "description": "Total number of orders placed", + "data_type": "INT64", + "tags": ["feature"], + "meta": {} + }, + "avg_order_value": { + "name": "avg_order_value", + "description": "Average order value in dollars", + "data_type": "FLOAT64", + "tags": ["feature"], + "meta": {} + }, + "preferred_payment": { + "name": "preferred_payment", + "description": "Preferred payment method", + "data_type": "STRING", + "tags": ["feature"], + "meta": {} + } + }, + "tags": ["feast", "ml", "customer"], + "meta": { + "owner": "ml-team@example.com" + }, + "depends_on": { + "nodes": [] + } + }, + "model.sample_dbt_project.location_stats": { + "name": "location_stats", + "unique_id": "model.sample_dbt_project.location_stats", + "resource_type": "model", + "database": "feast_demo", + "schema": "public", + "alias": "location_stats", + "description": "Location-based statistics (no feast tag)", + "columns": { + "location_id": { + "name": "location_id", + "description": "Location identifier", + "data_type": "STRING", + "tags": [], + "meta": {} + }, + "event_timestamp": { + "name": "event_timestamp", + "description": "Event timestamp", + "data_type": "TIMESTAMP", + "tags": [], + "meta": {} + }, + "demand_score": { + "name": "demand_score", + "description": "Demand score for the location", + "data_type": "FLOAT64", + "tags": [], + "meta": {} + } + }, + "tags": ["analytics"], + "meta": {}, + "depends_on": { + "nodes": [] + } + } + }, + "sources": { + "source.sample_dbt_project.raw_trips": { + "name": "raw_trips", + "unique_id": "source.sample_dbt_project.raw_trips", + "source_name": "raw_data", + "schema": "raw", + "identifier": "trips" + } + } +} diff --git a/sdk/python/tests/unit/dbt/test_mapper.py b/sdk/python/tests/unit/dbt/test_mapper.py new file mode 100644 index 00000000000..bd7fdae7d38 --- /dev/null +++ b/sdk/python/tests/unit/dbt/test_mapper.py @@ -0,0 +1,309 @@ +""" +Unit tests for dbt to Feast mapper. 
+""" + +import pytest +from datetime import timedelta + +from feast.dbt.parser import DbtModel, DbtColumn +from feast.dbt.mapper import ( + DbtToFeastMapper, + map_dbt_type_to_feast_type, + DBT_TO_FEAST_TYPE_MAP, +) +from feast.types import ( + String, + Int32, + Int64, + Float32, + Float64, + Bool, + UnixTimestamp, + Bytes, + Array, +) + + +class TestTypeMapping: + """Tests for dbt to Feast type mapping.""" + + def test_string_types(self): + """Test string type mappings.""" + assert map_dbt_type_to_feast_type("STRING") == String + assert map_dbt_type_to_feast_type("TEXT") == String + assert map_dbt_type_to_feast_type("VARCHAR") == String + assert map_dbt_type_to_feast_type("VARCHAR(255)") == String + assert map_dbt_type_to_feast_type("CHAR") == String + assert map_dbt_type_to_feast_type("NVARCHAR") == String + + def test_integer_types(self): + """Test integer type mappings.""" + assert map_dbt_type_to_feast_type("INT") == Int64 + assert map_dbt_type_to_feast_type("INT64") == Int64 + assert map_dbt_type_to_feast_type("INTEGER") == Int64 + assert map_dbt_type_to_feast_type("BIGINT") == Int64 + assert map_dbt_type_to_feast_type("INT32") == Int32 + assert map_dbt_type_to_feast_type("SMALLINT") == Int32 + assert map_dbt_type_to_feast_type("TINYINT") == Int32 + + def test_float_types(self): + """Test float type mappings.""" + assert map_dbt_type_to_feast_type("FLOAT") == Float32 + assert map_dbt_type_to_feast_type("FLOAT32") == Float32 + assert map_dbt_type_to_feast_type("FLOAT64") == Float64 + assert map_dbt_type_to_feast_type("DOUBLE") == Float64 + assert map_dbt_type_to_feast_type("DOUBLE PRECISION") == Float64 + assert map_dbt_type_to_feast_type("REAL") == Float32 + + def test_boolean_types(self): + """Test boolean type mappings.""" + assert map_dbt_type_to_feast_type("BOOL") == Bool + assert map_dbt_type_to_feast_type("BOOLEAN") == Bool + + def test_timestamp_types(self): + """Test timestamp type mappings.""" + assert map_dbt_type_to_feast_type("TIMESTAMP") == UnixTimestamp + assert map_dbt_type_to_feast_type("TIMESTAMP_NTZ") == UnixTimestamp + assert map_dbt_type_to_feast_type("TIMESTAMP_LTZ") == UnixTimestamp + assert map_dbt_type_to_feast_type("DATETIME") == UnixTimestamp + assert map_dbt_type_to_feast_type("DATE") == UnixTimestamp + + def test_binary_types(self): + """Test binary type mappings.""" + assert map_dbt_type_to_feast_type("BYTES") == Bytes + assert map_dbt_type_to_feast_type("BINARY") == Bytes + assert map_dbt_type_to_feast_type("VARBINARY") == Bytes + assert map_dbt_type_to_feast_type("BLOB") == Bytes + + def test_case_insensitivity(self): + """Test type mapping is case insensitive.""" + assert map_dbt_type_to_feast_type("string") == String + assert map_dbt_type_to_feast_type("String") == String + assert map_dbt_type_to_feast_type("STRING") == String + assert map_dbt_type_to_feast_type("int64") == Int64 + assert map_dbt_type_to_feast_type("INT64") == Int64 + + def test_parameterized_types(self): + """Test parameterized types are handled correctly.""" + assert map_dbt_type_to_feast_type("VARCHAR(255)") == String + assert map_dbt_type_to_feast_type("CHAR(10)") == String + assert map_dbt_type_to_feast_type("DECIMAL(10,2)") == Int64 + + def test_snowflake_number_precision(self): + """Test Snowflake NUMBER type with precision.""" + # Small precision -> Int32 + assert map_dbt_type_to_feast_type("NUMBER(5,0)") == Int32 + assert map_dbt_type_to_feast_type("NUMBER(9,0)") == Int32 + + # Medium precision -> Int64 + assert map_dbt_type_to_feast_type("NUMBER(10,0)") == Int64 + assert 
map_dbt_type_to_feast_type("NUMBER(18,0)") == Int64 + + # Large precision -> Float64 + assert map_dbt_type_to_feast_type("NUMBER(20,0)") == Float64 + + # With decimal places -> Float64 + assert map_dbt_type_to_feast_type("NUMBER(10,2)") == Float64 + assert map_dbt_type_to_feast_type("NUMBER(5,3)") == Float64 + + def test_array_types(self): + """Test ARRAY type mappings.""" + result = map_dbt_type_to_feast_type("ARRAY") + assert isinstance(result, Array) + + result = map_dbt_type_to_feast_type("ARRAY") + assert isinstance(result, Array) + + result = map_dbt_type_to_feast_type("ARRAY") + assert isinstance(result, Array) + + def test_unknown_type_defaults_to_string(self): + """Test unknown types default to String.""" + assert map_dbt_type_to_feast_type("UNKNOWN_TYPE") == String + assert map_dbt_type_to_feast_type("CUSTOM") == String + + def test_empty_type_defaults_to_string(self): + """Test empty type defaults to String.""" + assert map_dbt_type_to_feast_type("") == String + assert map_dbt_type_to_feast_type(None) == String + + +@pytest.fixture +def sample_model(): + """Create a sample DbtModel for testing.""" + return DbtModel( + name="driver_stats", + unique_id="model.test_project.driver_stats", + database="my_database", + schema="analytics", + alias="driver_stats", + description="Driver statistics", + columns=[ + DbtColumn(name="driver_id", data_type="INT64", description="Driver ID"), + DbtColumn( + name="event_timestamp", + data_type="TIMESTAMP", + description="Event time", + ), + DbtColumn( + name="trip_count", data_type="INT64", description="Number of trips" + ), + DbtColumn( + name="avg_rating", data_type="FLOAT64", description="Average rating" + ), + DbtColumn( + name="is_active", data_type="BOOLEAN", description="Is driver active" + ), + ], + tags=["feast", "ml"], + meta={"owner": "ml-team"}, + ) + + +class TestDbtToFeastMapper: + """Tests for DbtToFeastMapper class.""" + + def test_create_bigquery_data_source(self, sample_model): + """Test creating a BigQuery data source.""" + mapper = DbtToFeastMapper(data_source_type="bigquery") + source = mapper.create_data_source(sample_model) + + assert source.name == "driver_stats_source" + assert source.table == "my_database.analytics.driver_stats" + assert source.timestamp_field == "event_timestamp" + assert "dbt.model" in source.tags + assert source.tags["dbt.model"] == "driver_stats" + + def test_create_snowflake_data_source(self, sample_model): + """Test creating a Snowflake data source.""" + mapper = DbtToFeastMapper(data_source_type="snowflake") + source = mapper.create_data_source(sample_model) + + assert source.name == "driver_stats_source" + assert source.database == "my_database" + assert source.schema == "analytics" + assert source.table == "driver_stats" + assert source.timestamp_field == "event_timestamp" + + def test_create_file_data_source(self, sample_model): + """Test creating a file data source.""" + mapper = DbtToFeastMapper(data_source_type="file") + source = mapper.create_data_source(sample_model) + + assert source.name == "driver_stats_source" + assert "driver_stats.parquet" in source.path + + def test_unsupported_data_source_type(self, sample_model): + """Test error for unsupported data source type.""" + mapper = DbtToFeastMapper(data_source_type="unsupported") + + with pytest.raises(ValueError) as exc_info: + mapper.create_data_source(sample_model) + + assert "Unsupported data_source_type" in str(exc_info.value) + + def test_custom_timestamp_field(self, sample_model): + """Test custom timestamp field.""" + mapper = 
DbtToFeastMapper( + data_source_type="bigquery", timestamp_field="created_at" + ) + source = mapper.create_data_source(sample_model) + + assert source.timestamp_field == "created_at" + + def test_create_entity(self): + """Test creating an entity.""" + mapper = DbtToFeastMapper() + entity = mapper.create_entity( + name="driver_id", + description="Driver entity", + tags={"source": "dbt"}, + ) + + assert entity.name == "driver_id" + assert entity.join_key == "driver_id" + assert entity.description == "Driver entity" + assert entity.tags == {"source": "dbt"} + + def test_create_feature_view(self, sample_model): + """Test creating a feature view.""" + mapper = DbtToFeastMapper(data_source_type="bigquery", ttl_days=7) + source = mapper.create_data_source(sample_model) + fv = mapper.create_feature_view( + model=sample_model, + source=source, + entity_column="driver_id", + ) + + assert fv.name == "driver_stats" + assert fv.ttl == timedelta(days=7) + assert fv.description == "Driver statistics" + + # Check features (should exclude entity and timestamp) + feature_names = [f.name for f in fv.features] + assert "trip_count" in feature_names + assert "avg_rating" in feature_names + assert "is_active" in feature_names + assert "driver_id" not in feature_names + assert "event_timestamp" not in feature_names + + # Check tags + assert "dbt.model" in fv.tags + assert fv.tags["dbt.model"] == "driver_stats" + assert "dbt.tag.feast" in fv.tags + + def test_create_feature_view_with_exclude(self, sample_model): + """Test excluding columns from feature view.""" + mapper = DbtToFeastMapper(data_source_type="bigquery") + source = mapper.create_data_source(sample_model) + fv = mapper.create_feature_view( + model=sample_model, + source=source, + entity_column="driver_id", + exclude_columns=["is_active"], + ) + + feature_names = [f.name for f in fv.features] + assert "trip_count" in feature_names + assert "avg_rating" in feature_names + assert "is_active" not in feature_names + + def test_create_all_from_model(self, sample_model): + """Test creating all Feast objects from a model.""" + mapper = DbtToFeastMapper(data_source_type="bigquery") + result = mapper.create_all_from_model( + model=sample_model, + entity_column="driver_id", + ) + + assert "entity" in result + assert "data_source" in result + assert "feature_view" in result + + assert result["entity"].name == "driver_id" + assert result["data_source"].name == "driver_stats_source" + assert result["feature_view"].name == "driver_stats" + + def test_feature_type_mapping(self, sample_model): + """Test that feature types are correctly mapped.""" + mapper = DbtToFeastMapper(data_source_type="bigquery") + source = mapper.create_data_source(sample_model) + fv = mapper.create_feature_view( + model=sample_model, + source=source, + entity_column="driver_id", + ) + + # Find specific features and check types + trip_count = next((f for f in fv.features if f.name == "trip_count"), None) + avg_rating = next((f for f in fv.features if f.name == "avg_rating"), None) + is_active = next((f for f in fv.features if f.name == "is_active"), None) + + assert trip_count is not None + assert trip_count.dtype == Int64 + + assert avg_rating is not None + assert avg_rating.dtype == Float64 + + assert is_active is not None + assert is_active.dtype == Bool diff --git a/sdk/python/tests/unit/dbt/test_parser.py b/sdk/python/tests/unit/dbt/test_parser.py new file mode 100644 index 00000000000..d61f3d82f9f --- /dev/null +++ b/sdk/python/tests/unit/dbt/test_parser.py @@ -0,0 +1,293 @@ +""" +Unit 
tests for dbt manifest parser. +""" + +import json +import pytest +from pathlib import Path + +from feast.dbt.parser import DbtManifestParser, DbtModel, DbtColumn + + +@pytest.fixture +def sample_manifest(tmp_path): + """Create a sample dbt manifest.json for testing.""" + manifest = { + "metadata": { + "dbt_version": "1.5.0", + "project_name": "test_project", + "generated_at": "2024-01-10T00:00:00Z", + }, + "nodes": { + "model.test_project.driver_stats": { + "name": "driver_stats", + "unique_id": "model.test_project.driver_stats", + "resource_type": "model", + "database": "my_database", + "schema": "analytics", + "alias": "driver_stats", + "description": "Driver statistics aggregated hourly", + "columns": { + "driver_id": { + "name": "driver_id", + "description": "Unique driver identifier", + "data_type": "INT64", + "tags": ["entity"], + "meta": {}, + }, + "event_timestamp": { + "name": "event_timestamp", + "description": "Event timestamp", + "data_type": "TIMESTAMP", + "tags": [], + "meta": {}, + }, + "trip_count": { + "name": "trip_count", + "description": "Number of trips", + "data_type": "INT64", + "tags": ["feature"], + "meta": {}, + }, + "avg_rating": { + "name": "avg_rating", + "description": "Average driver rating", + "data_type": "FLOAT64", + "tags": ["feature"], + "meta": {}, + }, + }, + "tags": ["feast", "ml"], + "meta": {"owner": "data-team"}, + "depends_on": {"nodes": ["source.test_project.raw_trips"]}, + }, + "model.test_project.customer_stats": { + "name": "customer_stats", + "unique_id": "model.test_project.customer_stats", + "resource_type": "model", + "database": "my_database", + "schema": "analytics", + "alias": "customer_stats", + "description": "Customer statistics", + "columns": { + "customer_id": { + "name": "customer_id", + "description": "Unique customer ID", + "data_type": "STRING", + "tags": [], + "meta": {}, + }, + "event_timestamp": { + "name": "event_timestamp", + "description": "Event timestamp", + "data_type": "TIMESTAMP", + "tags": [], + "meta": {}, + }, + "order_count": { + "name": "order_count", + "description": "Total orders", + "data_type": "INT64", + "tags": [], + "meta": {}, + }, + }, + "tags": ["ml"], + "meta": {}, + "depends_on": {"nodes": []}, + }, + "test.test_project.some_test": { + "name": "some_test", + "unique_id": "test.test_project.some_test", + "resource_type": "test", + "database": "my_database", + "schema": "analytics", + "alias": "some_test", + "description": "A test node", + "columns": {}, + "tags": [], + "meta": {}, + "depends_on": {"nodes": []}, + }, + }, + } + + manifest_path = tmp_path / "manifest.json" + manifest_path.write_text(json.dumps(manifest)) + return manifest_path + + +class TestDbtManifestParser: + """Tests for DbtManifestParser class.""" + + def test_parse_manifest(self, sample_manifest): + """Test parsing a valid manifest file.""" + parser = DbtManifestParser(str(sample_manifest)) + parser.parse() + + assert parser.dbt_version == "1.5.0" + assert parser.project_name == "test_project" + + def test_parse_manifest_not_found(self, tmp_path): + """Test error when manifest file doesn't exist.""" + parser = DbtManifestParser(str(tmp_path / "nonexistent.json")) + + with pytest.raises(FileNotFoundError) as exc_info: + parser.parse() + + assert "dbt manifest not found" in str(exc_info.value) + + def test_parse_manifest_invalid_json(self, tmp_path): + """Test error when manifest is invalid JSON.""" + invalid_path = tmp_path / "invalid.json" + invalid_path.write_text("not valid json {{{") + + parser = 
DbtManifestParser(str(invalid_path)) + + with pytest.raises(ValueError) as exc_info: + parser.parse() + + assert "Invalid JSON" in str(exc_info.value) + + def test_get_all_models(self, sample_manifest): + """Test getting all models from manifest.""" + parser = DbtManifestParser(str(sample_manifest)) + models = parser.get_models() + + # Should only get models, not tests + assert len(models) == 2 + model_names = [m.name for m in models] + assert "driver_stats" in model_names + assert "customer_stats" in model_names + assert "some_test" not in model_names + + def test_get_models_by_name(self, sample_manifest): + """Test filtering models by name.""" + parser = DbtManifestParser(str(sample_manifest)) + models = parser.get_models(model_names=["driver_stats"]) + + assert len(models) == 1 + assert models[0].name == "driver_stats" + + def test_get_models_by_tag(self, sample_manifest): + """Test filtering models by tag.""" + parser = DbtManifestParser(str(sample_manifest)) + + # Filter by 'feast' tag - only driver_stats has it + feast_models = parser.get_models(tag_filter="feast") + assert len(feast_models) == 1 + assert feast_models[0].name == "driver_stats" + + # Filter by 'ml' tag - both models have it + ml_models = parser.get_models(tag_filter="ml") + assert len(ml_models) == 2 + + def test_model_properties(self, sample_manifest): + """Test DbtModel properties.""" + parser = DbtManifestParser(str(sample_manifest)) + model = parser.get_model_by_name("driver_stats") + + assert model is not None + assert model.name == "driver_stats" + assert model.unique_id == "model.test_project.driver_stats" + assert model.database == "my_database" + assert model.schema == "analytics" + assert model.alias == "driver_stats" + assert model.description == "Driver statistics aggregated hourly" + assert model.full_table_name == "my_database.analytics.driver_stats" + assert "feast" in model.tags + assert "ml" in model.tags + assert len(model.columns) == 4 + + def test_column_properties(self, sample_manifest): + """Test DbtColumn properties.""" + parser = DbtManifestParser(str(sample_manifest)) + model = parser.get_model_by_name("driver_stats") + + # Find the trip_count column + trip_count_col = next( + (c for c in model.columns if c.name == "trip_count"), None + ) + + assert trip_count_col is not None + assert trip_count_col.name == "trip_count" + assert trip_count_col.data_type == "INT64" + assert trip_count_col.description == "Number of trips" + assert "feature" in trip_count_col.tags + + def test_get_model_by_name_not_found(self, sample_manifest): + """Test getting a model that doesn't exist.""" + parser = DbtManifestParser(str(sample_manifest)) + model = parser.get_model_by_name("nonexistent_model") + + assert model is None + + def test_depends_on(self, sample_manifest): + """Test model dependencies are captured.""" + parser = DbtManifestParser(str(sample_manifest)) + model = parser.get_model_by_name("driver_stats") + + assert len(model.depends_on) == 1 + assert "source.test_project.raw_trips" in model.depends_on + + +class TestDbtColumn: + """Tests for DbtColumn dataclass.""" + + def test_column_defaults(self): + """Test DbtColumn default values.""" + col = DbtColumn(name="test_col") + + assert col.name == "test_col" + assert col.description == "" + assert col.data_type == "STRING" + assert col.tags == [] + assert col.meta == {} + + def test_column_with_all_fields(self): + """Test DbtColumn with all fields specified.""" + col = DbtColumn( + name="feature_col", + description="A feature column", + 
data_type="FLOAT64", + tags=["feature", "numeric"], + meta={"owner": "ml-team"}, + ) + + assert col.name == "feature_col" + assert col.description == "A feature column" + assert col.data_type == "FLOAT64" + assert col.tags == ["feature", "numeric"] + assert col.meta == {"owner": "ml-team"} + + +class TestDbtModel: + """Tests for DbtModel dataclass.""" + + def test_model_full_table_name(self): + """Test full_table_name property.""" + model = DbtModel( + name="test_model", + unique_id="model.proj.test_model", + database="prod_db", + schema="public", + alias="test_model_v2", + ) + + assert model.full_table_name == "prod_db.public.test_model_v2" + + def test_model_defaults(self): + """Test DbtModel default values.""" + model = DbtModel( + name="test", + unique_id="model.proj.test", + database="db", + schema="schema", + alias="test", + ) + + assert model.description == "" + assert model.columns == [] + assert model.tags == [] + assert model.meta == {} + assert model.depends_on == [] From d20962d0b4592cf59cf430d6facce0348e2f7e00 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 14:42:12 +0200 Subject: [PATCH 02/21] fix: Address mypy and ruff lint errors in dbt integration Signed-off-by: yassinnouh21 --- sdk/python/feast/cli/dbt_import.py | 12 ++++++------ sdk/python/feast/dbt/__init__.py | 4 ++-- sdk/python/feast/dbt/codegen.py | 7 +++---- sdk/python/feast/dbt/mapper.py | 5 ++--- sdk/python/feast/dbt/parser.py | 5 ++++- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/sdk/python/feast/cli/dbt_import.py b/sdk/python/feast/cli/dbt_import.py index bd47c855a8b..2f231cc4611 100644 --- a/sdk/python/feast/cli/dbt_import.py +++ b/sdk/python/feast/cli/dbt_import.py @@ -5,7 +5,7 @@ dbt models with Feast feature stores. """ -from typing import List, Optional +from typing import Any, Dict, List, Optional import click from colorama import Fore, Style @@ -119,8 +119,8 @@ def import_command( # Generate Python file instead of applying to registry feast dbt import -m target/manifest.json -e driver_id --tag feast --output features.py """ - from feast.dbt.parser import DbtManifestParser from feast.dbt.mapper import DbtToFeastMapper + from feast.dbt.parser import DbtManifestParser # Parse manifest click.echo(f"{Fore.CYAN}Parsing dbt manifest: {manifest_path}{Style.RESET_ALL}") @@ -177,8 +177,8 @@ def import_command( # Generate Feast objects click.echo(f"\n{Fore.CYAN}Generating Feast objects...{Style.RESET_ALL}") - all_objects = [] - entities_created = {} + all_objects: List[Any] = [] + entities_created: Dict[str, Any] = {} for model in models: # Validate timestamp field exists @@ -202,7 +202,7 @@ def import_command( if entity_column not in entities_created: entity = mapper.create_entity( name=entity_column, - description=f"Entity key for dbt models", + description="Entity key for dbt models", ) entities_created[entity_column] = entity all_objects.append(entity) @@ -274,7 +274,7 @@ def import_command( click.echo( f"\n{Fore.GREEN}✓ Generated Feast definitions: {output}{Style.RESET_ALL}" ) - click.echo(f" You can now import this file in your feature_store.yaml repo.") + click.echo(" You can now import this file in your feature_store.yaml repo.") return if dry_run: diff --git a/sdk/python/feast/dbt/__init__.py b/sdk/python/feast/dbt/__init__.py index 0c6a290f3cf..9851ea03b35 100644 --- a/sdk/python/feast/dbt/__init__.py +++ b/sdk/python/feast/dbt/__init__.py @@ -15,9 +15,9 @@ ... 
feature_view = mapper.create_feature_view(model, data_source, "driver_id") """ -from feast.dbt.parser import DbtManifestParser, DbtModel, DbtColumn -from feast.dbt.mapper import DbtToFeastMapper from feast.dbt.codegen import DbtCodeGenerator, generate_feast_code +from feast.dbt.mapper import DbtToFeastMapper +from feast.dbt.parser import DbtColumn, DbtManifestParser, DbtModel __all__ = [ "DbtManifestParser", diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index ec3cbea3cbc..bdef479cbe2 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -7,10 +7,10 @@ from typing import Any, List, Optional, Set -from jinja2 import Environment, BaseLoader +from jinja2 import BaseLoader, Environment -from feast.dbt.parser import DbtModel from feast.dbt.mapper import map_dbt_type_to_feast_type +from feast.dbt.parser import DbtModel from feast.types import ( Array, Bool, @@ -23,7 +23,6 @@ UnixTimestamp, ) - # Template for generating a complete Feast definitions file FEAST_FILE_TEMPLATE = '''""" Feast feature definitions generated from dbt models. @@ -239,7 +238,7 @@ def generate( "var_name": entity_var, "name": entity_column, "join_key": entity_column, - "description": f"Entity key for dbt models", + "description": "Entity key for dbt models", "tags": {"source": "dbt"}, }) diff --git a/sdk/python/feast/dbt/mapper.py b/sdk/python/feast/dbt/mapper.py index 1275b7cf0a1..965927d2b15 100644 --- a/sdk/python/feast/dbt/mapper.py +++ b/sdk/python/feast/dbt/mapper.py @@ -12,20 +12,19 @@ from feast.entity import Entity from feast.feature_view import FeatureView from feast.field import Field -from feast.value_type import ValueType from feast.types import ( Array, Bool, Bytes, + FeastType, Float32, Float64, Int32, Int64, String, UnixTimestamp, - FeastType, ) - +from feast.value_type import ValueType # Comprehensive mapping from dbt/warehouse types to Feast types # Covers BigQuery, Snowflake, Redshift, PostgreSQL, and common SQL types diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index ef3e7c8b80b..bc82702ca74 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -8,10 +8,10 @@ """ import json +from dataclasses import dataclass, field from enum import property from pathlib import Path from typing import Any, Dict, List, Optional -from dataclasses import dataclass, field @dataclass @@ -134,6 +134,9 @@ def get_models( if self._raw_manifest is None: self.parse() + if self._raw_manifest is None: + return [] + models = [] nodes = self._raw_manifest.get("nodes", {}) From 354f92188b97fe68038a3fb01b4d631d4b041a0e Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 14:46:44 +0200 Subject: [PATCH 03/21] fix: Address ruff lint errors in dbt unit tests Signed-off-by: yassinnouh21 --- sdk/python/tests/unit/dbt/test_mapper.py | 18 +++++++++--------- sdk/python/tests/unit/dbt/test_parser.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sdk/python/tests/unit/dbt/test_mapper.py b/sdk/python/tests/unit/dbt/test_mapper.py index bd7fdae7d38..7edd735aafc 100644 --- a/sdk/python/tests/unit/dbt/test_mapper.py +++ b/sdk/python/tests/unit/dbt/test_mapper.py @@ -2,25 +2,25 @@ Unit tests for dbt to Feast mapper. 
""" -import pytest from datetime import timedelta -from feast.dbt.parser import DbtModel, DbtColumn +import pytest + from feast.dbt.mapper import ( DbtToFeastMapper, map_dbt_type_to_feast_type, - DBT_TO_FEAST_TYPE_MAP, ) +from feast.dbt.parser import DbtColumn, DbtModel from feast.types import ( - String, - Int32, - Int64, + Array, + Bool, + Bytes, Float32, Float64, - Bool, + Int32, + Int64, + String, UnixTimestamp, - Bytes, - Array, ) diff --git a/sdk/python/tests/unit/dbt/test_parser.py b/sdk/python/tests/unit/dbt/test_parser.py index d61f3d82f9f..4b0ef36ffa7 100644 --- a/sdk/python/tests/unit/dbt/test_parser.py +++ b/sdk/python/tests/unit/dbt/test_parser.py @@ -3,10 +3,10 @@ """ import json + import pytest -from pathlib import Path -from feast.dbt.parser import DbtManifestParser, DbtModel, DbtColumn +from feast.dbt.parser import DbtColumn, DbtManifestParser, DbtModel @pytest.fixture From 5ca317cec670981dc228fe57677b8a09de86401c Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 14:51:18 +0200 Subject: [PATCH 04/21] style: Format dbt files with ruff Signed-off-by: yassinnouh21 --- sdk/python/feast/cli/dbt_import.py | 11 +++---- sdk/python/feast/dbt/codegen.py | 52 +++++++++++++++++------------- sdk/python/feast/dbt/parser.py | 3 +- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/sdk/python/feast/cli/dbt_import.py b/sdk/python/feast/cli/dbt_import.py index 2f231cc4611..33a084daaa0 100644 --- a/sdk/python/feast/cli/dbt_import.py +++ b/sdk/python/feast/cli/dbt_import.py @@ -155,9 +155,7 @@ def import_command( click.echo(f" Model names: {', '.join(model_names)}") raise SystemExit(0) - click.echo( - f"{Fore.GREEN}Found {len(models)} model(s) to import:{Style.RESET_ALL}" - ) + click.echo(f"{Fore.GREEN}Found {len(models)} model(s) to import:{Style.RESET_ALL}") for model in models: tags_str = f" [tags: {', '.join(model.tags)}]" if model.tags else "" click.echo(f" - {model.name} ({len(model.columns)} columns){tags_str}") @@ -241,7 +239,8 @@ def import_command( # Filter models that were actually processed (have valid columns) valid_models = [ - m for m in models + m + for m in models if timestamp_field in [c.name for c in m.columns] and entity_column in [c.name for c in m.columns] ] @@ -278,9 +277,7 @@ def import_command( return if dry_run: - click.echo( - f"\n{Fore.YELLOW}Dry run - no changes applied.{Style.RESET_ALL}" - ) + click.echo(f"\n{Fore.YELLOW}Dry run - no changes applied.{Style.RESET_ALL}") click.echo("Remove --dry-run flag to apply changes.") return diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index bdef479cbe2..ac4f17e07e3 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -234,13 +234,15 @@ def generate( # Prepare entity data entities = [] entity_var = _make_var_name(entity_column) - entities.append({ - "var_name": entity_var, - "name": entity_column, - "join_key": entity_column, - "description": "Entity key for dbt models", - "tags": {"source": "dbt"}, - }) + entities.append( + { + "var_name": entity_var, + "name": entity_column, + "join_key": entity_column, + "description": "Entity key for dbt models", + "tags": {"source": "dbt"}, + } + ) # Prepare data sources and feature views data_sources = [] @@ -298,25 +300,29 @@ def generate( else: type_imports.add(type_name) - fields.append({ - "name": column.name, - "dtype": type_name, - "description": _escape_description(column.description), - }) + fields.append( + { + "name": column.name, + "dtype": type_name, + "description": 
_escape_description(column.description), + } + ) # Feature view fv_var = _make_var_name(f"{model.name}_fv") - feature_views.append({ - "var_name": fv_var, - "name": model.name, - "entity_var": entity_var, - "source_var": source_var, - "ttl_days": self.ttl_days, - "fields": fields, - "online": online, - "description": _escape_description(model.description), - "tags": tags, - }) + feature_views.append( + { + "var_name": fv_var, + "name": model.name, + "entity_var": entity_var, + "source_var": source_var, + "ttl_days": self.ttl_days, + "fields": fields, + "online": online, + "description": _escape_description(model.description), + "tags": tags, + } + ) # Sort type imports for consistent output sorted_types = sorted(type_imports) diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index bc82702ca74..8d7730eac46 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -99,8 +99,7 @@ def parse(self) -> None: self._raw_manifest = json.load(f) except json.JSONDecodeError as e: raise ValueError( - f"Invalid JSON in manifest: {e}\n" - f"Try: dbt clean && dbt compile" + f"Invalid JSON in manifest: {e}\nTry: dbt clean && dbt compile" ) # Try to use dbt-artifacts-parser if available From a301f83d9001b3a8de8c76b05e2efacbad327b0b Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 15:05:28 +0200 Subject: [PATCH 05/21] fix: Remove unused dbt-artifacts-parser import and fix enum import Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/parser.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index 8d7730eac46..ddec3653bda 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -4,12 +4,11 @@ This module provides functionality to parse dbt manifest.json files and extract model metadata for generating Feast FeatureViews. -Uses dbt-artifacts-parser to handle manifest versions v1-v12 (dbt 0.19 through 1.11). +Supports dbt manifest versions v1-v12 (dbt 0.19 through 1.11+). """ import json from dataclasses import dataclass, field -from enum import property from pathlib import Path from typing import Any, Dict, List, Optional @@ -50,8 +49,7 @@ class DbtManifestParser: """ Parser for dbt manifest.json files. - Uses dbt-artifacts-parser to handle manifest versions v1-v12. - Supports dbt versions 0.19 through 1.11. + Supports dbt manifest versions v1-v12 (dbt versions 0.19 through 1.11+). 
Examples: >>> parser = DbtManifestParser("target/manifest.json") @@ -76,7 +74,6 @@ def __init__(self, manifest_path: str): manifest_path: Path to manifest.json file """ self.manifest_path = Path(manifest_path) - self.manifest = None self._raw_manifest: Optional[Dict[str, Any]] = None def parse(self) -> None: @@ -102,15 +99,6 @@ def parse(self) -> None: f"Invalid JSON in manifest: {e}\nTry: dbt clean && dbt compile" ) - # Try to use dbt-artifacts-parser if available - try: - from dbt_artifacts_parser.parser import parse_manifest - - self.manifest = parse_manifest(manifest=self._raw_manifest) - except ImportError: - # Fall back to raw dict parsing if dbt-artifacts-parser not installed - self.manifest = None - def get_models( self, model_names: Optional[List[str]] = None, From 460810ce0d5404d2fe2a89dd2e0ad427eda5d559 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 15:26:56 +0200 Subject: [PATCH 06/21] feat: Use dbt-artifacts-parser for typed manifest parsing - Add dbt-artifacts-parser as optional dependency (feast[dbt]) - Update parser to use typed parsing with fallback to raw dict - Provides better support for manifest versions v1-v12 Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/parser.py | 182 ++++++++++++++++++++++++--------- setup.py | 3 + 2 files changed, 136 insertions(+), 49 deletions(-) diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index ddec3653bda..b653d622c27 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -4,7 +4,7 @@ This module provides functionality to parse dbt manifest.json files and extract model metadata for generating Feast FeatureViews. -Supports dbt manifest versions v1-v12 (dbt 0.19 through 1.11+). +Uses dbt-artifacts-parser for typed parsing of manifest versions v1-v12 (dbt 0.19 through 1.11+). """ import json @@ -47,9 +47,10 @@ def full_table_name(self) -> str: class DbtManifestParser: """ - Parser for dbt manifest.json files. + Parser for dbt manifest.json files using dbt-artifacts-parser. - Supports dbt manifest versions v1-v12 (dbt versions 0.19 through 1.11+). + Uses dbt-artifacts-parser for typed parsing of manifest versions v1-v12 + (dbt versions 0.19 through 1.11+). Examples: >>> parser = DbtManifestParser("target/manifest.json") @@ -75,14 +76,16 @@ def __init__(self, manifest_path: str): """ self.manifest_path = Path(manifest_path) self._raw_manifest: Optional[Dict[str, Any]] = None + self._parsed_manifest: Optional[Any] = None def parse(self) -> None: """ - Load and parse the manifest.json file. + Load and parse the manifest.json file using dbt-artifacts-parser. 
Raises: FileNotFoundError: If manifest.json doesn't exist ValueError: If manifest.json is invalid JSON + ImportError: If dbt-artifacts-parser is not installed """ if not self.manifest_path.exists(): raise FileNotFoundError( @@ -99,6 +102,121 @@ def parse(self) -> None: f"Invalid JSON in manifest: {e}\nTry: dbt clean && dbt compile" ) + # Parse using dbt-artifacts-parser for typed access + try: + from dbt_artifacts_parser.parser import parse_manifest + + self._parsed_manifest = parse_manifest(manifest=self._raw_manifest) + except ImportError: + raise ImportError( + "dbt-artifacts-parser is required for dbt integration.\n" + "Install with: pip install 'feast[dbt]' or pip install dbt-artifacts-parser" + ) + + def _extract_column_from_node(self, col_name: str, col_data: Any) -> DbtColumn: + """Extract column info from a parsed node column.""" + # Handle both dict and typed object access + if isinstance(col_data, dict): + return DbtColumn( + name=col_name, + description=col_data.get("description", "") or "", + data_type=col_data.get("data_type", "STRING") or "STRING", + tags=col_data.get("tags", []) or [], + meta=col_data.get("meta", {}) or {}, + ) + else: + # Typed object from dbt-artifacts-parser + return DbtColumn( + name=col_name, + description=getattr(col_data, "description", "") or "", + data_type=getattr(col_data, "data_type", "STRING") or "STRING", + tags=list(getattr(col_data, "tags", []) or []), + meta=dict(getattr(col_data, "meta", {}) or {}), + ) + + def _extract_model_from_node(self, node_id: str, node: Any) -> Optional[DbtModel]: + """Extract DbtModel from a parsed manifest node.""" + # Handle both dict and typed object access + if isinstance(node, dict): + resource_type = node.get("resource_type", "model") + if resource_type != "model": + return None + + model_name = node.get("name", "") + node_tags = node.get("tags", []) or [] + node_columns = node.get("columns", {}) or {} + depends_on = node.get("depends_on", {}) or {} + depends_on_nodes = depends_on.get("nodes", []) or [] + + columns = [ + self._extract_column_from_node(col_name, col_data) + for col_name, col_data in node_columns.items() + ] + + return DbtModel( + name=model_name, + unique_id=node_id, + database=node.get("database", "") or "", + schema=node.get("schema", "") or "", + alias=node.get("alias", model_name) or model_name, + description=node.get("description", "") or "", + columns=columns, + tags=node_tags, + meta=node.get("meta", {}) or {}, + depends_on=depends_on_nodes, + ) + else: + # Typed object from dbt-artifacts-parser + resource_type = getattr(node, "resource_type", None) + if resource_type is None: + # Check if node_id indicates it's a model + if not node_id.startswith("model."): + return None + elif ( + str(resource_type) != "model" + and str( + resource_type.value + if hasattr(resource_type, "value") + else resource_type + ) + != "model" + ): + return None + + model_name = getattr(node, "name", "") + node_tags = list(getattr(node, "tags", []) or []) + node_columns = getattr(node, "columns", {}) or {} + depends_on = getattr(node, "depends_on", None) + + if depends_on: + depends_on_nodes = list(getattr(depends_on, "nodes", []) or []) + else: + depends_on_nodes = [] + + # Handle columns dict + if isinstance(node_columns, dict): + columns = [ + self._extract_column_from_node(col_name, col_data) + for col_name, col_data in node_columns.items() + ] + else: + columns = [] + + return DbtModel( + name=model_name, + unique_id=node_id, + database=getattr(node, "database", "") or "", + schema=getattr(node, "schema_", 
"") + or getattr(node, "schema", "") + or "", + alias=getattr(node, "alias", model_name) or model_name, + description=getattr(node, "description", "") or "", + columns=columns, + tags=node_tags, + meta=dict(getattr(node, "meta", {}) or {}), + depends_on=depends_on_nodes, + ) + def get_models( self, model_names: Optional[List[str]] = None, @@ -125,65 +243,31 @@ def get_models( return [] models = [] - nodes = self._raw_manifest.get("nodes", {}) + + # Use parsed manifest if available, fall back to raw + if self._parsed_manifest is not None: + nodes = getattr(self._parsed_manifest, "nodes", {}) or {} + else: + nodes = self._raw_manifest.get("nodes", {}) for node_id, node in nodes.items(): # Only process models (not tests, seeds, snapshots, etc.) if not node_id.startswith("model."): continue - # Also check resource_type if available - resource_type = node.get("resource_type", "model") - if resource_type != "model": + model = self._extract_model_from_node(node_id, node) + if model is None: continue - model_name = node.get("name", "") - # Filter by model names if specified - if model_names and model_name not in model_names: + if model_names and model.name not in model_names: continue - # Get tags from node - node_tags = node.get("tags", []) or [] - # Filter by tag if specified - if tag_filter and tag_filter not in node_tags: + if tag_filter and tag_filter not in model.tags: continue - # Extract columns - columns = [] - node_columns = node.get("columns", {}) or {} - for col_name, col_data in node_columns.items(): - if isinstance(col_data, dict): - columns.append( - DbtColumn( - name=col_name, - description=col_data.get("description", "") or "", - data_type=col_data.get("data_type", "STRING") or "STRING", - tags=col_data.get("tags", []) or [], - meta=col_data.get("meta", {}) or {}, - ) - ) - - # Get depends_on nodes - depends_on = node.get("depends_on", {}) or {} - depends_on_nodes = depends_on.get("nodes", []) or [] - - # Create DbtModel - models.append( - DbtModel( - name=model_name, - unique_id=node_id, - database=node.get("database", "") or "", - schema=node.get("schema", "") or "", - alias=node.get("alias", model_name) or model_name, - description=node.get("description", "") or "", - columns=columns, - tags=node_tags, - meta=node.get("meta", {}) or {}, - depends_on=depends_on_nodes, - ) - ) + models.append(model) return models diff --git a/setup.py b/setup.py index d4ecc5ee0af..c183aa650ef 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,8 @@ MILVUS_REQUIRED = ["pymilvus==2.4.15", "milvus-lite==2.4.12", "setuptools>=60,<81"] +DBT_REQUIRED = ["dbt-artifacts-parser>=0.6.0,<1"] + TORCH_REQUIRED = [ "torch>=2.7.0", "torchvision>=0.22.1", @@ -367,6 +369,7 @@ "qdrant": QDRANT_REQUIRED, "go": GO_REQUIRED, "milvus": MILVUS_REQUIRED, + "dbt": DBT_REQUIRED, "docling": DOCLING_REQUIRED, "pytorch": TORCH_REQUIRED, "nlp": NLP_REQUIRED, From b398368817af3d7b91ed2829f0bfa0b54a10769f Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 15:33:11 +0200 Subject: [PATCH 07/21] fix: Add graceful fallback for dbt-artifacts-parser validation errors When parsing minimal/incomplete manifests (e.g., in unit tests), dbt-artifacts-parser may fail validation. This change adds a graceful fallback to use raw dict parsing when typed parsing fails. Also updated test fixture with dbt_schema_version field. 
Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/parser.py | 3 +++ sdk/python/tests/unit/dbt/test_parser.py | 1 + 2 files changed, 4 insertions(+) diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index b653d622c27..11d7289a478 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -112,6 +112,9 @@ def parse(self) -> None: "dbt-artifacts-parser is required for dbt integration.\n" "Install with: pip install 'feast[dbt]' or pip install dbt-artifacts-parser" ) + except Exception: + # Fall back to raw manifest if typed parsing fails (e.g., incomplete manifest) + self._parsed_manifest = None def _extract_column_from_node(self, col_name: str, col_data: Any) -> DbtColumn: """Extract column info from a parsed node column.""" diff --git a/sdk/python/tests/unit/dbt/test_parser.py b/sdk/python/tests/unit/dbt/test_parser.py index 4b0ef36ffa7..49ec5d84dbb 100644 --- a/sdk/python/tests/unit/dbt/test_parser.py +++ b/sdk/python/tests/unit/dbt/test_parser.py @@ -14,6 +14,7 @@ def sample_manifest(tmp_path): """Create a sample dbt manifest.json for testing.""" manifest = { "metadata": { + "dbt_schema_version": "https://schemas.getdbt.com/dbt/manifest/v9.json", "dbt_version": "1.5.0", "project_name": "test_project", "generated_at": "2024-01-10T00:00:00Z", From 2f777ffe31465d8635cc2b59d9b3fc0adf23b619 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 15:52:47 +0200 Subject: [PATCH 08/21] fix: Skip dbt tests when dbt-artifacts-parser is not installed Since dbt-artifacts-parser is an optional dependency, unit tests should be skipped in CI when it's not installed. Signed-off-by: yassinnouh21 --- sdk/python/tests/unit/dbt/test_mapper.py | 3 +++ sdk/python/tests/unit/dbt/test_parser.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/sdk/python/tests/unit/dbt/test_mapper.py b/sdk/python/tests/unit/dbt/test_mapper.py index 7edd735aafc..809c4b43b8e 100644 --- a/sdk/python/tests/unit/dbt/test_mapper.py +++ b/sdk/python/tests/unit/dbt/test_mapper.py @@ -6,6 +6,9 @@ import pytest +# Skip all tests in this module if dbt-artifacts-parser is not installed +pytest.importorskip("dbt_artifacts_parser", reason="dbt-artifacts-parser not installed") + from feast.dbt.mapper import ( DbtToFeastMapper, map_dbt_type_to_feast_type, diff --git a/sdk/python/tests/unit/dbt/test_parser.py b/sdk/python/tests/unit/dbt/test_parser.py index 49ec5d84dbb..a843f600938 100644 --- a/sdk/python/tests/unit/dbt/test_parser.py +++ b/sdk/python/tests/unit/dbt/test_parser.py @@ -6,6 +6,9 @@ import pytest +# Skip all tests in this module if dbt-artifacts-parser is not installed +pytest.importorskip("dbt_artifacts_parser", reason="dbt-artifacts-parser not installed") + from feast.dbt.parser import DbtColumn, DbtManifestParser, DbtModel From 86fb9510a3d87b47cc9832a1aa4e35c11aaf524f Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 15:56:50 +0200 Subject: [PATCH 09/21] refactor: Simplify parser to rely solely on dbt-artifacts-parser Removed manual/fallback dict parsing code. The parser now exclusively uses dbt-artifacts-parser typed objects. Updated test fixtures to create complete manifests that dbt-artifacts-parser can parse. 
Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/parser.py | 175 ++++++++------------ sdk/python/tests/unit/dbt/test_parser.py | 201 ++++++++++++----------- 2 files changed, 173 insertions(+), 203 deletions(-) diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index 11d7289a478..2399d5413cc 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -112,113 +112,64 @@ def parse(self) -> None: "dbt-artifacts-parser is required for dbt integration.\n" "Install with: pip install 'feast[dbt]' or pip install dbt-artifacts-parser" ) - except Exception: - # Fall back to raw manifest if typed parsing fails (e.g., incomplete manifest) - self._parsed_manifest = None def _extract_column_from_node(self, col_name: str, col_data: Any) -> DbtColumn: """Extract column info from a parsed node column.""" - # Handle both dict and typed object access - if isinstance(col_data, dict): - return DbtColumn( - name=col_name, - description=col_data.get("description", "") or "", - data_type=col_data.get("data_type", "STRING") or "STRING", - tags=col_data.get("tags", []) or [], - meta=col_data.get("meta", {}) or {}, - ) - else: - # Typed object from dbt-artifacts-parser - return DbtColumn( - name=col_name, - description=getattr(col_data, "description", "") or "", - data_type=getattr(col_data, "data_type", "STRING") or "STRING", - tags=list(getattr(col_data, "tags", []) or []), - meta=dict(getattr(col_data, "meta", {}) or {}), - ) + return DbtColumn( + name=col_name, + description=getattr(col_data, "description", "") or "", + data_type=getattr(col_data, "data_type", "STRING") or "STRING", + tags=list(getattr(col_data, "tags", []) or []), + meta=dict(getattr(col_data, "meta", {}) or {}), + ) def _extract_model_from_node(self, node_id: str, node: Any) -> Optional[DbtModel]: """Extract DbtModel from a parsed manifest node.""" - # Handle both dict and typed object access - if isinstance(node, dict): - resource_type = node.get("resource_type", "model") - if resource_type != "model": + # Check resource type + resource_type = getattr(node, "resource_type", None) + if resource_type is None: + if not node_id.startswith("model."): return None - - model_name = node.get("name", "") - node_tags = node.get("tags", []) or [] - node_columns = node.get("columns", {}) or {} - depends_on = node.get("depends_on", {}) or {} - depends_on_nodes = depends_on.get("nodes", []) or [] - - columns = [ - self._extract_column_from_node(col_name, col_data) - for col_name, col_data in node_columns.items() - ] - - return DbtModel( - name=model_name, - unique_id=node_id, - database=node.get("database", "") or "", - schema=node.get("schema", "") or "", - alias=node.get("alias", model_name) or model_name, - description=node.get("description", "") or "", - columns=columns, - tags=node_tags, - meta=node.get("meta", {}) or {}, - depends_on=depends_on_nodes, - ) else: - # Typed object from dbt-artifacts-parser - resource_type = getattr(node, "resource_type", None) - if resource_type is None: - # Check if node_id indicates it's a model - if not node_id.startswith("model."): - return None - elif ( - str(resource_type) != "model" - and str( - resource_type.value - if hasattr(resource_type, "value") - else resource_type - ) - != "model" - ): + resource_type_str = ( + resource_type.value + if hasattr(resource_type, "value") + else str(resource_type) + ) + if resource_type_str != "model": return None - model_name = getattr(node, "name", "") - node_tags = list(getattr(node, "tags", []) or []) - node_columns = 
getattr(node, "columns", {}) or {} - depends_on = getattr(node, "depends_on", None) - - if depends_on: - depends_on_nodes = list(getattr(depends_on, "nodes", []) or []) - else: - depends_on_nodes = [] - - # Handle columns dict - if isinstance(node_columns, dict): - columns = [ - self._extract_column_from_node(col_name, col_data) - for col_name, col_data in node_columns.items() - ] - else: - columns = [] - - return DbtModel( - name=model_name, - unique_id=node_id, - database=getattr(node, "database", "") or "", - schema=getattr(node, "schema_", "") - or getattr(node, "schema", "") - or "", - alias=getattr(node, "alias", model_name) or model_name, - description=getattr(node, "description", "") or "", - columns=columns, - tags=node_tags, - meta=dict(getattr(node, "meta", {}) or {}), - depends_on=depends_on_nodes, - ) + model_name = getattr(node, "name", "") + node_tags = list(getattr(node, "tags", []) or []) + node_columns = getattr(node, "columns", {}) or {} + depends_on = getattr(node, "depends_on", None) + + if depends_on: + depends_on_nodes = list(getattr(depends_on, "nodes", []) or []) + else: + depends_on_nodes = [] + + # Extract columns + columns = [ + self._extract_column_from_node(col_name, col_data) + for col_name, col_data in node_columns.items() + ] + + # Get schema - dbt-artifacts-parser uses schema_ to avoid Python keyword + schema = getattr(node, "schema_", "") or getattr(node, "schema", "") or "" + + return DbtModel( + name=model_name, + unique_id=node_id, + database=getattr(node, "database", "") or "", + schema=schema, + alias=getattr(node, "alias", model_name) or model_name, + description=getattr(node, "description", "") or "", + columns=columns, + tags=node_tags, + meta=dict(getattr(node, "meta", {}) or {}), + depends_on=depends_on_nodes, + ) def get_models( self, @@ -239,19 +190,14 @@ def get_models( >>> models = parser.get_models(model_names=["driver_stats"]) >>> models = parser.get_models(tag_filter="feast") """ - if self._raw_manifest is None: + if self._parsed_manifest is None: self.parse() - if self._raw_manifest is None: + if self._parsed_manifest is None: return [] models = [] - - # Use parsed manifest if available, fall back to raw - if self._parsed_manifest is not None: - nodes = getattr(self._parsed_manifest, "nodes", {}) or {} - else: - nodes = self._raw_manifest.get("nodes", {}) + nodes = getattr(self._parsed_manifest, "nodes", {}) or {} for node_id, node in nodes.items(): # Only process models (not tests, seeds, snapshots, etc.) 
@@ -290,15 +236,22 @@ def get_model_by_name(self, model_name: str) -> Optional[DbtModel]: @property def dbt_version(self) -> Optional[str]: """Get dbt version from manifest metadata.""" - if self._raw_manifest is None: + if self._parsed_manifest is None: + return None + metadata = getattr(self._parsed_manifest, "metadata", None) + if metadata is None: return None - metadata = self._raw_manifest.get("metadata", {}) - return metadata.get("dbt_version") + return getattr(metadata, "dbt_version", None) @property def project_name(self) -> Optional[str]: """Get project name from manifest metadata.""" - if self._raw_manifest is None: + if self._parsed_manifest is None: + return None + metadata = getattr(self._parsed_manifest, "metadata", None) + if metadata is None: return None - metadata = self._raw_manifest.get("metadata", {}) - return metadata.get("project_name") + # project_name may not exist in all manifest versions + return getattr(metadata, "project_name", None) or getattr( + metadata, "project_id", None + ) diff --git a/sdk/python/tests/unit/dbt/test_parser.py b/sdk/python/tests/unit/dbt/test_parser.py index a843f600938..2e15f9863ee 100644 --- a/sdk/python/tests/unit/dbt/test_parser.py +++ b/sdk/python/tests/unit/dbt/test_parser.py @@ -12,6 +12,65 @@ from feast.dbt.parser import DbtColumn, DbtManifestParser, DbtModel +def _create_model_node( + name: str, + unique_id: str, + database: str = "my_database", + schema: str = "analytics", + description: str = "", + columns: dict = None, + tags: list = None, + meta: dict = None, + depends_on_nodes: list = None, +): + """Helper to create a complete model node for dbt-artifacts-parser.""" + return { + "name": name, + "unique_id": unique_id, + "resource_type": "model", + "package_name": "test_project", + "path": f"models/{name}.sql", + "original_file_path": f"models/{name}.sql", + "fqn": ["test_project", name], + "alias": name, + "checksum": {"name": "sha256", "checksum": "abc123"}, + "database": database, + "schema": schema, + "description": description or "", + "columns": columns or {}, + "tags": tags or [], + "meta": meta or {}, + "config": { + "enabled": True, + "materialized": "table", + "tags": tags or [], + "meta": meta or {}, + }, + "depends_on": {"nodes": depends_on_nodes or [], "macros": []}, + "refs": [], + "sources": [], + "metrics": [], + "compiled_path": f"target/compiled/test_project/models/{name}.sql", + } + + +def _create_column( + name: str, + data_type: str = "STRING", + description: str = "", + tags: list = None, + meta: dict = None, +): + """Helper to create a column definition.""" + return { + "name": name, + "description": description or "", + "data_type": data_type, + "tags": tags or [], + "meta": meta or {}, + } + + @pytest.fixture def sample_manifest(tmp_path): """Create a sample dbt manifest.json for testing.""" @@ -19,101 +78,62 @@ def sample_manifest(tmp_path): "metadata": { "dbt_schema_version": "https://schemas.getdbt.com/dbt/manifest/v9.json", "dbt_version": "1.5.0", - "project_name": "test_project", "generated_at": "2024-01-10T00:00:00Z", + "invocation_id": "test-invocation-id", + "env": {}, + "adapter_type": "bigquery", }, "nodes": { - "model.test_project.driver_stats": { - "name": "driver_stats", - "unique_id": "model.test_project.driver_stats", - "resource_type": "model", - "database": "my_database", - "schema": "analytics", - "alias": "driver_stats", - "description": "Driver statistics aggregated hourly", - "columns": { - "driver_id": { - "name": "driver_id", - "description": "Unique driver identifier", - 
"data_type": "INT64", - "tags": ["entity"], - "meta": {}, - }, - "event_timestamp": { - "name": "event_timestamp", - "description": "Event timestamp", - "data_type": "TIMESTAMP", - "tags": [], - "meta": {}, - }, - "trip_count": { - "name": "trip_count", - "description": "Number of trips", - "data_type": "INT64", - "tags": ["feature"], - "meta": {}, - }, - "avg_rating": { - "name": "avg_rating", - "description": "Average driver rating", - "data_type": "FLOAT64", - "tags": ["feature"], - "meta": {}, - }, + "model.test_project.driver_stats": _create_model_node( + name="driver_stats", + unique_id="model.test_project.driver_stats", + description="Driver statistics aggregated hourly", + columns={ + "driver_id": _create_column( + "driver_id", "INT64", "Unique driver identifier", ["entity"] + ), + "event_timestamp": _create_column( + "event_timestamp", "TIMESTAMP", "Event timestamp" + ), + "trip_count": _create_column( + "trip_count", "INT64", "Number of trips", ["feature"] + ), + "avg_rating": _create_column( + "avg_rating", "FLOAT64", "Average driver rating", ["feature"] + ), }, - "tags": ["feast", "ml"], - "meta": {"owner": "data-team"}, - "depends_on": {"nodes": ["source.test_project.raw_trips"]}, - }, - "model.test_project.customer_stats": { - "name": "customer_stats", - "unique_id": "model.test_project.customer_stats", - "resource_type": "model", - "database": "my_database", - "schema": "analytics", - "alias": "customer_stats", - "description": "Customer statistics", - "columns": { - "customer_id": { - "name": "customer_id", - "description": "Unique customer ID", - "data_type": "STRING", - "tags": [], - "meta": {}, - }, - "event_timestamp": { - "name": "event_timestamp", - "description": "Event timestamp", - "data_type": "TIMESTAMP", - "tags": [], - "meta": {}, - }, - "order_count": { - "name": "order_count", - "description": "Total orders", - "data_type": "INT64", - "tags": [], - "meta": {}, - }, + tags=["feast", "ml"], + meta={"owner": "data-team"}, + depends_on_nodes=["source.test_project.raw_trips"], + ), + "model.test_project.customer_stats": _create_model_node( + name="customer_stats", + unique_id="model.test_project.customer_stats", + description="Customer statistics", + columns={ + "customer_id": _create_column( + "customer_id", "STRING", "Unique customer ID" + ), + "event_timestamp": _create_column( + "event_timestamp", "TIMESTAMP", "Event timestamp" + ), + "order_count": _create_column( + "order_count", "INT64", "Total orders" + ), }, - "tags": ["ml"], - "meta": {}, - "depends_on": {"nodes": []}, - }, - "test.test_project.some_test": { - "name": "some_test", - "unique_id": "test.test_project.some_test", - "resource_type": "test", - "database": "my_database", - "schema": "analytics", - "alias": "some_test", - "description": "A test node", - "columns": {}, - "tags": [], - "meta": {}, - "depends_on": {"nodes": []}, - }, + tags=["ml"], + ), }, + "sources": {}, + "macros": {}, + "docs": {}, + "exposures": {}, + "metrics": {}, + "groups": {}, + "selectors": {}, + "disabled": {}, + "parent_map": {}, + "child_map": {}, } manifest_path = tmp_path / "manifest.json" @@ -130,7 +150,6 @@ def test_parse_manifest(self, sample_manifest): parser.parse() assert parser.dbt_version == "1.5.0" - assert parser.project_name == "test_project" def test_parse_manifest_not_found(self, tmp_path): """Test error when manifest file doesn't exist.""" @@ -158,12 +177,10 @@ def test_get_all_models(self, sample_manifest): parser = DbtManifestParser(str(sample_manifest)) models = parser.get_models() - # Should only 
get models, not tests assert len(models) == 2 model_names = [m.name for m in models] assert "driver_stats" in model_names assert "customer_stats" in model_names - assert "some_test" not in model_names def test_get_models_by_name(self, sample_manifest): """Test filtering models by name.""" From a17b50caa7d5634651c5c4c78453b1caa7742f04 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 15:59:46 +0200 Subject: [PATCH 10/21] ci: Add dbt-artifacts-parser to unit test dependencies Install dbt-artifacts-parser in CI so dbt unit tests run instead of being skipped. Signed-off-by: yassinnouh21 --- .github/workflows/unit_tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 7f7cc149d81..ddeed9b8903 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -32,6 +32,8 @@ jobs: enable-cache: true - name: Install dependencies run: make install-python-dependencies-ci + - name: Install dbt extra dependencies + run: uv pip install --system dbt-artifacts-parser - name: Test Python run: make test-python-unit - name: Minimize uv cache From 55174a5a1b6b1c4b629696a4a5d651a9f7aa3f2d Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 16:04:23 +0200 Subject: [PATCH 11/21] fix: Address Copilot code review comments for dbt integration - mapper.py: Fix Array element type check to use set membership instead of incorrect isinstance() comparison - codegen.py: Add safe getattr() with fallback for Array.base_type access Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/codegen.py | 5 +++-- sdk/python/feast/dbt/mapper.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index ac4f17e07e3..86e67b4dcf0 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -123,8 +123,9 @@ def _get_feast_type_name(feast_type: Any) -> str: """Get the string name of a Feast type for code generation.""" if isinstance(feast_type, Array): - # Handle Array types - base_type_name = _get_feast_type_name(feast_type.base_type) + # Handle Array types - safely get base_type with fallback + base_type = getattr(feast_type, "base_type", String) + base_type_name = _get_feast_type_name(base_type) return f"Array({base_type_name})" # Map type objects to their names diff --git a/sdk/python/feast/dbt/mapper.py b/sdk/python/feast/dbt/mapper.py index 965927d2b15..9861d5feafe 100644 --- a/sdk/python/feast/dbt/mapper.py +++ b/sdk/python/feast/dbt/mapper.py @@ -110,7 +110,8 @@ def map_dbt_type_to_feast_type(dbt_type: str) -> FeastType: element_type_str = normalized[6:-1].strip() element_type = map_dbt_type_to_feast_type(element_type_str) # Array only supports primitive types - if isinstance(element_type, type(String)): + valid_array_types = {String, Int32, Int64, Float32, Float64, Bool, Bytes, UnixTimestamp} + if element_type in valid_array_types: return Array(element_type) return Array(String) # Fallback for complex nested types From e4ba00ae9e493ba62995606844d34b97e18c64f8 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 16:06:06 +0200 Subject: [PATCH 12/21] fix: Only add ellipsis to truncated descriptions Signed-off-by: yassinnouh21 --- sdk/python/feast/cli/dbt_import.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/cli/dbt_import.py b/sdk/python/feast/cli/dbt_import.py index 33a084daaa0..790b20c00a5 100644 --- a/sdk/python/feast/cli/dbt_import.py 
+++ b/sdk/python/feast/cli/dbt_import.py @@ -365,7 +365,8 @@ def list_command( click.echo(f"{Fore.CYAN}{model.name}{Style.RESET_ALL}{tags_str}") click.echo(f" Table: {model.full_table_name}") if model.description: - click.echo(f" Description: {model.description[:80]}...") + desc = model.description[:80] + ("..." if len(model.description) > 80 else "") + click.echo(f" Description: {desc}") if show_columns and model.columns: click.echo(f" Columns ({len(model.columns)}):") From 01730a8379b9b400b6632a57f10f73f966852988 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 16:24:54 +0200 Subject: [PATCH 13/21] style: Format dbt files with ruff Signed-off-by: yassinnouh21 --- sdk/python/feast/cli/dbt_import.py | 4 +++- sdk/python/feast/dbt/mapper.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/cli/dbt_import.py b/sdk/python/feast/cli/dbt_import.py index 790b20c00a5..b09fd90ec6d 100644 --- a/sdk/python/feast/cli/dbt_import.py +++ b/sdk/python/feast/cli/dbt_import.py @@ -365,7 +365,9 @@ def list_command( click.echo(f"{Fore.CYAN}{model.name}{Style.RESET_ALL}{tags_str}") click.echo(f" Table: {model.full_table_name}") if model.description: - desc = model.description[:80] + ("..." if len(model.description) > 80 else "") + desc = model.description[:80] + ( + "..." if len(model.description) > 80 else "" + ) click.echo(f" Description: {desc}") if show_columns and model.columns: diff --git a/sdk/python/feast/dbt/mapper.py b/sdk/python/feast/dbt/mapper.py index 9861d5feafe..1ecdb9f2731 100644 --- a/sdk/python/feast/dbt/mapper.py +++ b/sdk/python/feast/dbt/mapper.py @@ -110,7 +110,16 @@ def map_dbt_type_to_feast_type(dbt_type: str) -> FeastType: element_type_str = normalized[6:-1].strip() element_type = map_dbt_type_to_feast_type(element_type_str) # Array only supports primitive types - valid_array_types = {String, Int32, Int64, Float32, Float64, Bool, Bytes, UnixTimestamp} + valid_array_types = { + String, + Int32, + Int64, + Float32, + Float64, + Bool, + Bytes, + UnixTimestamp, + } if element_type in valid_array_types: return Array(element_type) return Array(String) # Fallback for complex nested types From 8a06b833cee872000abff80c92dd87c270dc8f7d Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sat, 10 Jan 2026 16:48:26 +0200 Subject: [PATCH 14/21] fix: Convert doctest examples to code blocks to avoid CI failures Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/__init__.py | 19 ++++++++++--------- sdk/python/feast/dbt/codegen.py | 29 +++++++++++++++-------------- sdk/python/feast/dbt/mapper.py | 21 +++++++-------------- sdk/python/feast/dbt/parser.py | 20 +++++++++++--------- 4 files changed, 43 insertions(+), 46 deletions(-) diff --git a/sdk/python/feast/dbt/__init__.py b/sdk/python/feast/dbt/__init__.py index 9851ea03b35..7d1312d5a1a 100644 --- a/sdk/python/feast/dbt/__init__.py +++ b/sdk/python/feast/dbt/__init__.py @@ -4,15 +4,16 @@ This module provides functionality to import dbt models as Feast FeatureViews, enabling automatic generation of Feast objects from dbt manifest.json files. -Example usage: - >>> from feast.dbt import DbtManifestParser, DbtToFeastMapper - >>> parser = DbtManifestParser("target/manifest.json") - >>> parser.parse() - >>> models = parser.get_models(tag_filter="feast") - >>> mapper = DbtToFeastMapper(data_source_type="bigquery") - >>> for model in models: - ... data_source = mapper.create_data_source(model) - ... 
feature_view = mapper.create_feature_view(model, data_source, "driver_id") +Example usage:: + + from feast.dbt import DbtManifestParser, DbtToFeastMapper + parser = DbtManifestParser("target/manifest.json") + parser.parse() + models = parser.get_models(tag_filter="feast") + mapper = DbtToFeastMapper(data_source_type="bigquery") + for model in models: + data_source = mapper.create_data_source(model) + feature_view = mapper.create_feature_view(model, data_source, "driver_id") """ from feast.dbt.codegen import DbtCodeGenerator, generate_feast_code diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index 86e67b4dcf0..5626b441b83 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -168,20 +168,21 @@ class DbtCodeGenerator: This class creates complete, importable Python files containing Entity, DataSource, and FeatureView definitions. - Example: - >>> generator = DbtCodeGenerator( - ... data_source_type="bigquery", - ... timestamp_field="event_timestamp", - ... ttl_days=7 - ... ) - >>> code = generator.generate( - ... models=models, - ... entity_column="user_id", - ... manifest_path="target/manifest.json", - ... project_name="my_project" - ... ) - >>> with open("features.py", "w") as f: - ... f.write(code) + Example:: + + generator = DbtCodeGenerator( + data_source_type="bigquery", + timestamp_field="event_timestamp", + ttl_days=7 + ) + code = generator.generate( + models=models, + entity_column="user_id", + manifest_path="target/manifest.json", + project_name="my_project" + ) + with open("features.py", "w") as f: + f.write(code) """ def __init__( diff --git a/sdk/python/feast/dbt/mapper.py b/sdk/python/feast/dbt/mapper.py index 1ecdb9f2731..2d6d63fbd32 100644 --- a/sdk/python/feast/dbt/mapper.py +++ b/sdk/python/feast/dbt/mapper.py @@ -90,14 +90,6 @@ def map_dbt_type_to_feast_type(dbt_type: str) -> FeastType: Returns: The corresponding Feast type - - Examples: - >>> map_dbt_type_to_feast_type("STRING") - String - >>> map_dbt_type_to_feast_type("INT64") - Int64 - >>> map_dbt_type_to_feast_type("ARRAY") - Array(String) """ if not dbt_type: return String @@ -164,12 +156,13 @@ class DbtToFeastMapper: Supports creating DataSource, Entity, and FeatureView objects from dbt model metadata. - Examples: - >>> mapper = DbtToFeastMapper(data_source_type="bigquery") - >>> data_source = mapper.create_data_source(model) - >>> feature_view = mapper.create_feature_view( - ... model, data_source, entity_column="driver_id" - ... ) + Example:: + + mapper = DbtToFeastMapper(data_source_type="bigquery") + data_source = mapper.create_data_source(model) + feature_view = mapper.create_feature_view( + model, data_source, entity_column="driver_id" + ) Args: data_source_type: Type of data source ('bigquery', 'snowflake', 'file') diff --git a/sdk/python/feast/dbt/parser.py b/sdk/python/feast/dbt/parser.py index 2399d5413cc..f7d3e587e54 100644 --- a/sdk/python/feast/dbt/parser.py +++ b/sdk/python/feast/dbt/parser.py @@ -52,12 +52,13 @@ class DbtManifestParser: Uses dbt-artifacts-parser for typed parsing of manifest versions v1-v12 (dbt versions 0.19 through 1.11+). - Examples: - >>> parser = DbtManifestParser("target/manifest.json") - >>> parser.parse() - >>> models = parser.get_models(tag_filter="feast") - >>> for model in models: - ... 
print(f"Model: {model.name}, Columns: {len(model.columns)}") + Example:: + + parser = DbtManifestParser("target/manifest.json") + parser.parse() + models = parser.get_models(tag_filter="feast") + for model in models: + print(f"Model: {model.name}, Columns: {len(model.columns)}") Args: manifest_path: Path to manifest.json file (typically target/manifest.json) @@ -186,9 +187,10 @@ def get_models( Returns: List of DbtModel objects - Examples: - >>> models = parser.get_models(model_names=["driver_stats"]) - >>> models = parser.get_models(tag_filter="feast") + Example:: + + models = parser.get_models(model_names=["driver_stats"]) + models = parser.get_models(tag_filter="feast") """ if self._parsed_manifest is None: self.parse() From fb40e935e29ee750056502e49b973a0bdcb44d0f Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sun, 11 Jan 2026 13:18:17 +0200 Subject: [PATCH 15/21] fix: Add dbt-artifacts-parser to feast[ci] and update requirements - Add dbt-artifacts-parser to pyproject.toml under feast[dbt] and feast[ci] extras - Remove separate install step from unit_tests.yml workflow - Update all requirements lock files Addresses review feedback from @ntkathole. Signed-off-by: YassinNouh21 Signed-off-by: yassinnouh21 --- .github/workflows/unit_tests.yml | 2 -- pyproject.toml | 3 +++ sdk/python/requirements/py3.10-ci-requirements.txt | 4 ++++ sdk/python/requirements/py3.11-ci-requirements.txt | 4 ++++ sdk/python/requirements/py3.12-ci-requirements.txt | 4 ++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index ddeed9b8903..7f7cc149d81 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -32,8 +32,6 @@ jobs: enable-cache: true - name: Install dependencies run: make install-python-dependencies-ci - - name: Install dbt extra dependencies - run: uv pip install --system dbt-artifacts-parser - name: Test Python run: make test-python-unit - name: Minimize uv cache diff --git a/pyproject.toml b/pyproject.toml index f2620b0915d..3acf0865ced 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -133,10 +133,13 @@ snowflake = [ sqlite_vec = ["sqlite-vec==v0.1.6"] mcp = ["fastapi_mcp"] +dbt = ["dbt-artifacts-parser"] + ci = [ "build", "virtualenv==20.23.0", "cryptography>=43.0,<44", + "dbt-artifacts-parser", "ruff>=0.8.0", "mypy-protobuf>=3.1", "grpcio-tools>=1.56.2,<=1.62.3", diff --git a/sdk/python/requirements/py3.10-ci-requirements.txt b/sdk/python/requirements/py3.10-ci-requirements.txt index 2c448173938..37d0004fb07 100644 --- a/sdk/python/requirements/py3.10-ci-requirements.txt +++ b/sdk/python/requirements/py3.10-ci-requirements.txt @@ -857,6 +857,10 @@ db-dtypes==1.5.0 \ # via # google-cloud-bigquery # pandas-gbq +dbt-artifacts-parser==0.12.0 \ + --hash=sha256:3db93df7969c3f22c6fbf75a51b0af4c21b189d8db6f3c54e8471102c775bb0d \ + --hash=sha256:9d1c0ed41926102c1c39fdd780e1a332f58c9b794e94dba0dcf5dfefc847d6ea + # via feast (setup.py) debugpy==1.8.19 \ --hash=sha256:0601708223fe1cd0e27c6cce67a899d92c7d68e73690211e6788a4b0e1903f5b \ --hash=sha256:14035cbdbb1fe4b642babcdcb5935c2da3b1067ac211c5c5a8fdc0bb31adbcaa \ diff --git a/sdk/python/requirements/py3.11-ci-requirements.txt b/sdk/python/requirements/py3.11-ci-requirements.txt index 091236521ff..77efcc4f93d 100644 --- a/sdk/python/requirements/py3.11-ci-requirements.txt +++ b/sdk/python/requirements/py3.11-ci-requirements.txt @@ -937,6 +937,10 @@ db-dtypes==1.5.0 \ # via # google-cloud-bigquery # pandas-gbq +dbt-artifacts-parser==0.12.0 \ 
+ --hash=sha256:3db93df7969c3f22c6fbf75a51b0af4c21b189d8db6f3c54e8471102c775bb0d \ + --hash=sha256:9d1c0ed41926102c1c39fdd780e1a332f58c9b794e94dba0dcf5dfefc847d6ea + # via feast (setup.py) debugpy==1.8.19 \ --hash=sha256:0601708223fe1cd0e27c6cce67a899d92c7d68e73690211e6788a4b0e1903f5b \ --hash=sha256:14035cbdbb1fe4b642babcdcb5935c2da3b1067ac211c5c5a8fdc0bb31adbcaa \ diff --git a/sdk/python/requirements/py3.12-ci-requirements.txt b/sdk/python/requirements/py3.12-ci-requirements.txt index 50efe8231a9..f6ab00ea4a2 100644 --- a/sdk/python/requirements/py3.12-ci-requirements.txt +++ b/sdk/python/requirements/py3.12-ci-requirements.txt @@ -933,6 +933,10 @@ db-dtypes==1.5.0 \ # via # google-cloud-bigquery # pandas-gbq +dbt-artifacts-parser==0.12.0 \ + --hash=sha256:3db93df7969c3f22c6fbf75a51b0af4c21b189d8db6f3c54e8471102c775bb0d \ + --hash=sha256:9d1c0ed41926102c1c39fdd780e1a332f58c9b794e94dba0dcf5dfefc847d6ea + # via feast (setup.py) debugpy==1.8.19 \ --hash=sha256:0601708223fe1cd0e27c6cce67a899d92c7d68e73690211e6788a4b0e1903f5b \ --hash=sha256:14035cbdbb1fe4b642babcdcb5935c2da3b1067ac211c5c5a8fdc0bb31adbcaa \ From 53932ffe72645b0948b42120d99517744dbd3698 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Sun, 11 Jan 2026 13:59:11 +0200 Subject: [PATCH 16/21] docs: Add dbt integration documentation Add comprehensive documentation for the new dbt integration feature: - Quick start guide with step-by-step instructions - CLI reference for `feast dbt list` and `feast dbt import` - Type mapping table for dbt to Feast types - Data source configuration examples (BigQuery, Snowflake, File) - Best practices for tagging, documentation, and CI/CD - Troubleshooting section Addresses review feedback from @franciscojavierarceo. Signed-off-by: YassinNouh21 Signed-off-by: yassinnouh21 --- docs/SUMMARY.md | 1 + docs/how-to-guides/dbt-integration.md | 370 ++++++++++++++++++++++++++ 2 files changed, 371 insertions(+) create mode 100644 docs/how-to-guides/dbt-integration.md diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 7d85aba1ad0..27a84d31213 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -76,6 +76,7 @@ * [Adding a custom provider](how-to-guides/customizing-feast/creating-a-custom-provider.md) * [Adding or reusing tests](how-to-guides/adding-or-reusing-tests.md) * [Starting Feast servers in TLS(SSL) Mode](how-to-guides/starting-feast-servers-tls-mode.md) +* [Importing Features from dbt](how-to-guides/dbt-integration.md) ## Reference diff --git a/docs/how-to-guides/dbt-integration.md b/docs/how-to-guides/dbt-integration.md new file mode 100644 index 00000000000..7b687df504f --- /dev/null +++ b/docs/how-to-guides/dbt-integration.md @@ -0,0 +1,370 @@ +# Importing Features from dbt + +This guide explains how to use Feast's dbt integration to automatically import dbt models as Feast FeatureViews. This enables you to leverage your existing dbt transformations as feature definitions without manual duplication. + +## Overview + +[dbt (data build tool)](https://www.getdbt.com/) is a popular tool for transforming data in your warehouse. Many teams already use dbt to create feature tables. Feast's dbt integration allows you to: + +- **Discover** dbt models tagged for feature engineering +- **Import** model metadata (columns, types, descriptions) as Feast objects +- **Generate** Python code for Entity, DataSource, and FeatureView definitions + +This eliminates the need to manually define Feast objects that mirror your dbt models. 
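+
+The CLI is the primary interface, but the manifest parser behind it can also be used from Python. Below is a minimal sketch mirroring the `Example::` block in the `DbtManifestParser` docstring; the import path is an assumption based on the SDK module layout (`sdk/python/feast/dbt/parser.py`):
+
+```python
+# Assumed import path; DbtManifestParser lives in sdk/python/feast/dbt/parser.py.
+from feast.dbt.parser import DbtManifestParser
+
+# Parse a compiled dbt manifest and inspect the models tagged for Feast import.
+parser = DbtManifestParser("target/manifest.json")
+parser.parse()
+for model in parser.get_models(tag_filter="feast"):
+    print(f"Model: {model.name}, Columns: {len(model.columns)}")
+```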
+ +## Prerequisites + +- A dbt project with compiled artifacts (`target/manifest.json`) +- Feast installed with dbt support: + +```bash +pip install 'feast[dbt]' +``` + +Or install the parser directly: + +```bash +pip install dbt-artifacts-parser +``` + +## Quick Start + +### 1. Tag your dbt models + +In your dbt project, add a `feast` tag to models you want to import: + +{% code title="models/driver_features.sql" %} +```sql +{{ config( + materialized='table', + tags=['feast'] +) }} + +SELECT + driver_id, + event_timestamp, + avg_rating, + total_trips, + is_active +FROM {{ ref('stg_drivers') }} +``` +{% endcode %} + +### 2. Define column types in schema.yml + +Feast uses column metadata from your `schema.yml` to determine feature types: + +{% code title="models/schema.yml" %} +```yaml +version: 2 +models: + - name: driver_features + description: "Driver aggregated features for ML models" + columns: + - name: driver_id + description: "Unique driver identifier" + data_type: STRING + - name: event_timestamp + description: "Feature timestamp" + data_type: TIMESTAMP + - name: avg_rating + description: "Average driver rating" + data_type: FLOAT64 + - name: total_trips + description: "Total completed trips" + data_type: INT64 + - name: is_active + description: "Whether driver is currently active" + data_type: BOOLEAN +``` +{% endcode %} + +### 3. Compile your dbt project + +```bash +cd your_dbt_project +dbt compile +``` + +This generates `target/manifest.json` which Feast will read. + +### 4. List available models + +Use the Feast CLI to discover tagged models: + +```bash +feast dbt list target/manifest.json --tag-filter feast +``` + +Output: +``` +Found 1 model(s) with tag 'feast': + + driver_features + Description: Driver aggregated features for ML models + Columns: driver_id, event_timestamp, avg_rating, total_trips, is_active + Tags: feast +``` + +### 5. Import models as Feast definitions + +Generate a Python file with Feast object definitions: + +```bash +feast dbt import target/manifest.json \ + --entity-column driver_id \ + --data-source-type bigquery \ + --tag-filter feast \ + --output features/driver_features.py +``` + +This generates: + +{% code title="features/driver_features.py" %} +```python +""" +Feast feature definitions generated from dbt models. 
+ +Source: target/manifest.json +Project: my_dbt_project +Generated by: feast dbt import +""" + +from datetime import timedelta + +from feast import Entity, FeatureView, Field +from feast.types import Bool, Float64, Int64 +from feast.infra.offline_stores.bigquery_source import BigQuerySource + + +# Entities +driver_id = Entity( + name="driver_id", + join_keys=["driver_id"], + description="Entity key for dbt models", + tags={'source': 'dbt'}, +) + + +# Data Sources +driver_features_source = BigQuerySource( + name="driver_features_source", + table="my_project.my_dataset.driver_features", + timestamp_field="event_timestamp", + description="Driver aggregated features for ML models", + tags={'dbt.model': 'driver_features', 'dbt.tag.feast': 'true'}, +) + + +# Feature Views +driver_features_fv = FeatureView( + name="driver_features", + entities=[driver_id], + ttl=timedelta(days=1), + schema=[ + Field(name="avg_rating", dtype=Float64, description="Average driver rating"), + Field(name="total_trips", dtype=Int64, description="Total completed trips"), + Field(name="is_active", dtype=Bool, description="Whether driver is currently active"), + ], + online=True, + source=driver_features_source, + description="Driver aggregated features for ML models", + tags={'dbt.model': 'driver_features', 'dbt.tag.feast': 'true'}, +) +``` +{% endcode %} + +## CLI Reference + +### `feast dbt list` + +Discover dbt models available for import. + +```bash +feast dbt list [OPTIONS] +``` + +**Arguments:** +- `manifest_path`: Path to dbt's `manifest.json` file + +**Options:** +- `--tag-filter`, `-t`: Filter models by dbt tag (e.g., `feast`) +- `--model`, `-m`: Filter to specific model name(s) + +### `feast dbt import` + +Import dbt models as Feast object definitions. + +```bash +feast dbt import [OPTIONS] +``` + +**Arguments:** +- `manifest_path`: Path to dbt's `manifest.json` file + +**Options:** + +| Option | Description | Default | +|--------|-------------|---------| +| `--entity-column`, `-e` | Column to use as entity key | (required) | +| `--data-source-type`, `-d` | Data source type: `bigquery`, `snowflake`, `file` | `bigquery` | +| `--tag-filter`, `-t` | Filter models by dbt tag | None | +| `--model`, `-m` | Import specific model(s) only | None | +| `--timestamp-field` | Timestamp column name | `event_timestamp` | +| `--ttl-days` | Feature TTL in days | `1` | +| `--exclude-columns` | Columns to exclude from features | None | +| `--no-online` | Disable online serving | `False` | +| `--output`, `-o` | Output Python file path | None (stdout) | +| `--dry-run` | Preview without generating code | `False` | + +## Type Mapping + +Feast automatically maps dbt/warehouse column types to Feast types: + +| dbt/SQL Type | Feast Type | +|--------------|------------| +| `STRING`, `VARCHAR`, `TEXT` | `String` | +| `INT`, `INTEGER`, `BIGINT` | `Int64` | +| `SMALLINT`, `TINYINT` | `Int32` | +| `FLOAT`, `REAL` | `Float32` | +| `DOUBLE`, `FLOAT64` | `Float64` | +| `BOOLEAN`, `BOOL` | `Bool` | +| `TIMESTAMP`, `DATETIME` | `UnixTimestamp` | +| `BYTES`, `BINARY` | `Bytes` | +| `ARRAY` | `Array(type)` | + +Snowflake `NUMBER(precision, scale)` types are handled specially: +- Scale > 0: `Float64` +- Precision <= 9: `Int32` +- Precision <= 18: `Int64` +- Precision > 18: `Float64` + +## Data Source Configuration + +### BigQuery + +```bash +feast dbt import manifest.json -e user_id -d bigquery -o features.py +``` + +Generates `BigQuerySource` with the full table path from dbt metadata: +```python +BigQuerySource( + 
table="project.dataset.table_name", + ... +) +``` + +### Snowflake + +```bash +feast dbt import manifest.json -e user_id -d snowflake -o features.py +``` + +Generates `SnowflakeSource` with database, schema, and table: +```python +SnowflakeSource( + database="MY_DB", + schema="MY_SCHEMA", + table="TABLE_NAME", + ... +) +``` + +### File + +```bash +feast dbt import manifest.json -e user_id -d file -o features.py +``` + +Generates `FileSource` with a placeholder path: +```python +FileSource( + path="/data/table_name.parquet", + ... +) +``` + +{% hint style="info" %} +For file sources, update the generated path to point to your actual data files. +{% endhint %} + +## Best Practices + +### 1. Use consistent tagging + +Create a standard tagging convention in your dbt project: + +```yaml +# dbt_project.yml +models: + my_project: + features: + +tags: ['feast'] # All models in features/ get the feast tag +``` + +### 2. Document your columns + +Column descriptions from `schema.yml` are preserved in the generated Feast definitions, making your feature catalog self-documenting. + +### 3. Review before committing + +Use `--dry-run` to preview what will be generated: + +```bash +feast dbt import manifest.json -e user_id -d bigquery --dry-run +``` + +### 4. Version control generated code + +Commit the generated Python files to your repository. This allows you to: +- Track changes to feature definitions over time +- Review dbt-to-Feast mapping in pull requests +- Customize generated code if needed + +### 5. Integrate with CI/CD + +Add dbt import to your CI pipeline: + +```yaml +# .github/workflows/features.yml +- name: Compile dbt + run: dbt compile + +- name: Generate Feast definitions + run: | + feast dbt import target/manifest.json \ + -e user_id -d bigquery -t feast \ + -o feature_repo/features.py + +- name: Apply Feast changes + run: feast apply +``` + +## Limitations + +- **Single entity support**: Currently supports one entity column per import. For multi-entity models, run multiple imports or manually adjust the generated code. +- **No incremental updates**: Each import generates a complete file. Use version control to track changes. +- **Column types required**: Models without `data_type` in schema.yml default to `String` type. + +## Troubleshooting + +### "manifest.json not found" + +Run `dbt compile` or `dbt run` first to generate the manifest file. + +### "No models found with tag" + +Check that your models have the correct tag in their config: + +```sql +{{ config(tags=['feast']) }} +``` + +### "Missing entity column" + +Ensure your dbt model includes the entity column specified with `--entity-column`. Models missing this column are skipped with a warning. + +### "Missing timestamp column" + +By default, Feast looks for `event_timestamp`. Use `--timestamp-field` to specify a different column name. From 972fc96ec88639e958b4522f09d0e7d59c996737 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Wed, 14 Jan 2026 10:56:23 +0200 Subject: [PATCH 17/21] docs: Add alpha warning to dbt integration documentation Add prominent warning callout highlighting that the dbt integration is an alpha feature with current limitations. This sets proper expectations for users regarding: - Supported data sources (BigQuery, Snowflake, File only) - Single entity per model constraint - Potential for breaking changes in future releases Addresses feedback from PR #5827 review comments. 
Signed-off-by: yassinnouh21 --- docs/how-to-guides/dbt-integration.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/how-to-guides/dbt-integration.md b/docs/how-to-guides/dbt-integration.md index 7b687df504f..abaadbf8740 100644 --- a/docs/how-to-guides/dbt-integration.md +++ b/docs/how-to-guides/dbt-integration.md @@ -1,5 +1,16 @@ # Importing Features from dbt +{% hint style="warning" %} +**Alpha Feature**: The dbt integration is currently in early development and subject to change. + +**Current Limitations**: +- Supported data sources: BigQuery, Snowflake, and File-based sources only +- Single entity per model +- Manual entity column specification required + +Breaking changes may occur in future releases. +{% endhint %} + This guide explains how to use Feast's dbt integration to automatically import dbt models as Feast FeatureViews. This enables you to leverage your existing dbt transformations as feature definitions without manual duplication. ## Overview From b2901f48edd8d06e1f9af76b3825e509a8b77f18 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Wed, 14 Jan 2026 10:56:44 +0200 Subject: [PATCH 18/21] fix: Add dbt-artifacts-parser to CI_REQUIRED dependencies Ensure dbt-artifacts-parser is installed in CI environments by adding it to the CI_REQUIRED list in setup.py. This matches the dependency already present in pyproject.toml and ensures CI tests for dbt integration have access to the required parser library. Addresses feedback from PR #5827 review comments. Signed-off-by: yassinnouh21 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index c183aa650ef..94211f142ad 100644 --- a/setup.py +++ b/setup.py @@ -199,6 +199,7 @@ "build", "virtualenv==20.23.0", "cryptography>=43.0,<44", + "dbt-artifacts-parser>=0.6.0,<1", "ruff>=0.8.0", "mypy-protobuf>=3.1", "grpcio-tools>=1.56.2,<=1.62.3", From fe253c1dc7ec1866a1a750f7485e0170d85a4fac Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Wed, 14 Jan 2026 11:19:57 +0200 Subject: [PATCH 19/21] fix: Add defensive Array.base_type handling with logging Add logging and defensive attribute access for Array.base_type in code generation to prevent potential AttributeError. While Array.__init__ always sets base_type, defensive programming with warnings provides: - Protection against edge cases or future Array implementation changes - Clear visibility when fallback occurs via logger.warning - Consistent error handling across both usage sites Changes: - Add logging module and logger instance - Update _get_feast_type_name() to use getattr with warning - Update import tracking logic to use getattr with warning - Add concise comments with examples (e.g., Array(String) -> base_type = String) Addresses code review feedback from PR #5827. Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/codegen.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index 5626b441b83..d8f5f2e3293 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -5,10 +5,13 @@ (Entity, DataSource, FeatureView) from dbt model metadata. 
""" +import logging from typing import Any, List, Optional, Set from jinja2 import BaseLoader, Environment +logger = logging.getLogger(__name__) + from feast.dbt.mapper import map_dbt_type_to_feast_type from feast.dbt.parser import DbtModel from feast.types import ( @@ -123,8 +126,17 @@ def _get_feast_type_name(feast_type: Any) -> str: """Get the string name of a Feast type for code generation.""" if isinstance(feast_type, Array): - # Handle Array types - safely get base_type with fallback - base_type = getattr(feast_type, "base_type", String) + # Safely get base_type. Should always exist since Array.__init__ sets it. + # Example: Array(String) -> base_type = String + base_type = getattr(feast_type, "base_type", None) + + if base_type is None: + logger.warning( + "Array type missing 'base_type' attribute. " + "This indicates a bug in Array initialization. Falling back to String." + ) + base_type = String + base_type_name = _get_feast_type_name(base_type) return f"Array({base_type_name})" @@ -293,11 +305,20 @@ def generate( feast_type = map_dbt_type_to_feast_type(column.data_type) type_name = _get_feast_type_name(feast_type) - # Track base type for imports (handle Array specially) + # Track base type for imports. For Array types, import both Array and base type. + # Example: Array(Int64) requires imports: Array, Int64 if isinstance(feast_type, Array): type_imports.add("Array") - # Also add the base type - base_type_name = _get_feast_type_name(feast_type.base_type) + + base_type = getattr(feast_type, "base_type", None) + if base_type is None: + logger.warning( + "Array type missing 'base_type' attribute while generating imports. " + "This indicates a bug in Array initialization. Falling back to String." + ) + base_type = String + + base_type_name = _get_feast_type_name(base_type) type_imports.add(base_type_name) else: type_imports.add(type_name) From ed2c2919850a3350f85d75c8f3ee10899f775f21 Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Wed, 14 Jan 2026 11:41:01 +0200 Subject: [PATCH 20/21] docs: Add comment explaining ImageBytes/PdfBytes exclusion Add clarifying comment in type_map explaining why ImageBytes and PdfBytes are not included in the dbt type mapping. While these types exist in Feast, dbt manifests only expose generic BYTES type without semantic information to distinguish between regular bytes, images, or PDFs. Example: A dbt model with image and PDF columns both appear as 'BYTES' in the manifest, making ImageBytes/PdfBytes types unmappable from dbt artifacts. Addresses feedback from PR #5827 review (franciscojavierarceo). Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/codegen.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index d8f5f2e3293..8003e370b72 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -140,7 +140,9 @@ def _get_feast_type_name(feast_type: Any) -> str: base_type_name = _get_feast_type_name(base_type) return f"Array({base_type_name})" - # Map type objects to their names + # Map type objects to their names. + # Note: ImageBytes and PdfBytes are excluded since dbt manifests only expose + # generic BYTES type without semantic information about binary content. 
type_map = { String: "String", Int32: "Int32", From 7a50c7326d24824dd1e8afe96fa0640bbf6aceca Mon Sep 17 00:00:00 2001 From: yassinnouh21 Date: Fri, 16 Jan 2026 16:21:33 +0200 Subject: [PATCH 21/21] fix: Move imports to top of file to resolve linter errors - Fix E402 linter error in feast/dbt/codegen.py by moving imports before logger initialization - Update requirements files to include dbt-artifacts-parser in pydantic dependency comments Signed-off-by: yassinnouh21 --- sdk/python/feast/dbt/codegen.py | 4 ++-- sdk/python/requirements/py3.10-ci-requirements.txt | 1 + sdk/python/requirements/py3.11-ci-requirements.txt | 1 + sdk/python/requirements/py3.12-ci-requirements.txt | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/dbt/codegen.py b/sdk/python/feast/dbt/codegen.py index 8003e370b72..1c7acfb944c 100644 --- a/sdk/python/feast/dbt/codegen.py +++ b/sdk/python/feast/dbt/codegen.py @@ -10,8 +10,6 @@ from jinja2 import BaseLoader, Environment -logger = logging.getLogger(__name__) - from feast.dbt.mapper import map_dbt_type_to_feast_type from feast.dbt.parser import DbtModel from feast.types import ( @@ -26,6 +24,8 @@ UnixTimestamp, ) +logger = logging.getLogger(__name__) + # Template for generating a complete Feast definitions file FEAST_FILE_TEMPLATE = '''""" Feast feature definitions generated from dbt models. diff --git a/sdk/python/requirements/py3.10-ci-requirements.txt b/sdk/python/requirements/py3.10-ci-requirements.txt index 37d0004fb07..38d936e0873 100644 --- a/sdk/python/requirements/py3.10-ci-requirements.txt +++ b/sdk/python/requirements/py3.10-ci-requirements.txt @@ -3772,6 +3772,7 @@ pydantic==2.12.5 \ --hash=sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d # via # feast (setup.py) + # dbt-artifacts-parser # docling # docling-core # docling-ibm-models diff --git a/sdk/python/requirements/py3.11-ci-requirements.txt b/sdk/python/requirements/py3.11-ci-requirements.txt index 77efcc4f93d..568d1756c31 100644 --- a/sdk/python/requirements/py3.11-ci-requirements.txt +++ b/sdk/python/requirements/py3.11-ci-requirements.txt @@ -3937,6 +3937,7 @@ pydantic==2.12.5 \ # via # feast (setup.py) # codeflare-sdk + # dbt-artifacts-parser # docling # docling-core # docling-ibm-models diff --git a/sdk/python/requirements/py3.12-ci-requirements.txt b/sdk/python/requirements/py3.12-ci-requirements.txt index f6ab00ea4a2..1b00d233284 100644 --- a/sdk/python/requirements/py3.12-ci-requirements.txt +++ b/sdk/python/requirements/py3.12-ci-requirements.txt @@ -3927,6 +3927,7 @@ pydantic==2.12.5 \ # via # feast (setup.py) # codeflare-sdk + # dbt-artifacts-parser # docling # docling-core # docling-ibm-models