diff --git a/docs/samples/fraud_detection_demo.ipynb b/docs/samples/fraud_detection_demo.ipynb new file mode 100644 index 000000000..1408de700 --- /dev/null +++ b/docs/samples/fraud_detection_demo.ipynb @@ -0,0 +1,1025 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7b19a0cd-31da-45b7-91a4-9cd561f3d3d8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Fraud Detection Sample\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n", + "\n", + "\n", + "In the following Notebook, we \n", + "1. Install the latest Feathr code (to include some unreleased features) \n", + "2. Define Environment Variables & `yaml_config` Settings \n", + "3. Create `FeathrClient` and Define `FeatureAnchor`\n", + "4. `build_features` and `get_offline_features` \n", + "5. Train Fraud Detection Model with `KNeighborsClassifier`\n", + "6. `materialize_features` and `multi_get_online_features`\n", + "7. 
`register_features` and `list_registered_features`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0b51153e-40dd-43d5-9d3a-501534156e6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup Feathr Developer Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b9c63dd5-304e-4797-a230-8fb753710dbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "! pip install feathr azure-cli" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "69222adf-1cb0-410b-b98d-e22877f358c0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "from feathr import FeathrClient\n", + "from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c0299d67-1103-4aa4-ba57-300498ae2579", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "! az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "58291272-00e5-4fe3-99d6-f1b89726f692", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# replace with your prefix\n", + "resource_prefix = " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Permission\n", + "To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n", + "\n", + "~~~ \n", + "userId=\n", + "resource_prefix=\n", + "synapse_workspace_name=\"${resource_prefix}syws\"\n", + "keyvault_name=\"${resource_prefix}kv\"\n", + "objectId=$(az ad user show --id $userId --query id -o tsv)\n", + "az keyvault update --name $keyvault_name --enable-rbac-authorization false\n", + "az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n", + "az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n", + "az synapse role assignment create --workspace-name $synapse_workspace_name --role \"Synapse Contributor\" --assignee $userId\n", + "~~~" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a8a70f27-d520-4d3c-bb8c-f364f84cb738", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "key_vault_name=resource_prefix+\"kv\"\n", + 
"synapse_workspace_url=resource_prefix+\"syws\"\n", + "adls_account=resource_prefix+\"dls\"\n", + "adls_fs_name=resource_prefix+\"fs\"\n", + "purview_name=resource_prefix+\"purview\"\n", + "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", + "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", + "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", + "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", + "retrieved_secret = client.get_secret(secretName).value\n", + "\n", + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", + "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", + "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", + "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", + "\n", + "# Set the resource link\n", + "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", + "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", + "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "50b2f73e-6380-42c3-91e8-4f3e15bc10d6", + "showTitle": false, + "title": "" + } + 
}, + "outputs": [], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'fraud_detection_test'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + "offline_store:\n", + " adls:\n", + " adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: 's3.amazonaws.com'\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: 'feathrtestdb'\n", + " jdbc_table: 'feathrtesttable'\n", + " snowflake:\n", + " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", + " user: \"feathrintegration\"\n", + " role: \"ACCOUNTADMIN\"\n", + "spark_config:\n", + " spark_cluster: 'azure_synapse'\n", + " spark_result_output_parts: '1'\n", + " azure_synapse:\n", + " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", + " pool_name: 'spark3'\n", + " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n", + " executor_size: 'Small'\n", + " executor_num: 4\n", + " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", + " databricks:\n", + " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", + " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", + " work_dir: 'dbfs:/fraud_detection_test'\n", + " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", + "online_store:\n", + " redis:\n", + " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", + " port: 
6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n", + "\"\"\"\n", + "# Write the config once through the temp-file handle; the context manager closes it,\n", + "# and delete=False keeps the file on disk so it can be reopened by name (tmp.name) afterwards.\n", + "with tempfile.NamedTemporaryFile(mode='w', delete=False) as tmp:\n", + "    tmp.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "eab0957c-c906-4297-a729-8dd8d79cb629", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Initialize `Feathr Client`\n", + "- `FeathrClient`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3734eee3-12f9-44db-a440-ad375ef859f0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=tmp.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Define Features\n", + "- `HdfsSource`\n", + "- `TypedKey`\n", + "- `Feature`\n", + "- `FeatureAnchor`\n", + "- `DerivedFeature`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b073b509-0f95-4e23-b16b-ffd8190fb6a2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Account Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b3668eeb-e4a0-4327-baf6-5521c856f51d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Refer to the Feathr documentation to learn more about the details of each method\n", + "account_info = HdfsSource(name=\"AccountData\",\n", + " 
path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n", + " event_timestamp_column=\"transactionDate\",\n", + " timestamp_format=\"yyyyMMdd\")\n", + "\n", + "accountId = TypedKey(key_column=\"accountID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"account id\")\n", + "\n", + "account_country = Feature(name=\"account_country\",\n", + " key=accountId,\n", + " feature_type=STRING, \n", + " transform=\"accountCountry\")\n", + "\n", + "is_user_registered = Feature(name=\"is_user_registered\",\n", + " key=accountId,\n", + " feature_type=BOOLEAN,\n", + " transform=\"isUserRegistered==TRUE\")\n", + "\n", + "num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=\"numPaymentRejects1dPerUser\")\n", + "\n", + "account_age = Feature(name=\"account_age\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=\"accountAge\")\n", + " \n", + "features = [\n", + " account_country,\n", + " account_age,\n", + " is_user_registered,\n", + " num_payment_rejects_1d_per_user\n", + "]\n", + "\n", + "account_anchor = FeatureAnchor(name=\"account_features\",\n", + " source=account_info,\n", + " features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "6f12c07e-4faf-4411-8acd-6f5d13b962f8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Transaction Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "280062b9-ae21-4a1a-ae94-86a5c17fd589", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# # #Refer to to learn more about the details of each method\n", + "\n", + "transaction_data = HdfsSource(name=\"transaction_data\",\n", + " 
path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n", + " event_timestamp_column=\"transactionDate\",\n", + " timestamp_format=\"yyyyMMdd\")\n", + "\n", + "transaction_id = Feature(name=\"transaction_id\",\n", + " key=accountId,\n", + " feature_type=STRING,\n", + " transform=\"transactionID\")\n", + "\n", + "transaction_currency_code = Feature(name=\"transaction_currency_code\",\n", + " key=accountId,\n", + " feature_type=STRING,\n", + " transform=\"transactionCurrencyCode\")\n", + " \n", + "transaction_amount = Feature(name=\"transaction_amount\",\n", + " key=accountId,\n", + " feature_type=FLOAT,\n", + " transform=\"transactionAmount\")\n", + "\n", + "transaction_device_id = Feature(name=\"transaction_device_id\",\n", + " key=accountId,\n", + " feature_type=FLOAT,\n", + " transform=\"transactionDeviceId\")\n", + "\n", + "transaction_ip_address = Feature(name=\"transaction_ip_address\",\n", + " key=accountId,\n", + " feature_type=FLOAT,\n", + " transform=\"transactionIPaddress\")\n", + "\n", + "transaction_time = Feature(name=\"transaction_time\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=\"transactionTime\")\n", + "\n", + "fraud_status = Feature(name=\"fraud_status\",\n", + " key=accountId,\n", + " feature_type=STRING,\n", + " transform=\"fraud_tag\")\n", + "\n", + "features = [\n", + " transaction_id,\n", + " transaction_amount,\n", + " transaction_device_id,\n", + " transaction_ip_address,\n", + " transaction_time,\n", + " transaction_currency_code,\n", + " fraud_status\n", + "]\n", + "\n", + "transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n", + " source=transaction_data,\n", + " features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "86ac05e1-26bb-4820-87ea-f547e3561181", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Transaction Aggregated Features" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4c969554-f690-42f5-b70a-d962bf558b03", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# average amount of transaction in the past week\n", + "transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n", + " path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n", + " event_timestamp_column=\"transactionDate\",\n", + " timestamp_format=\"yyyyMMdd\")\n", + "\n", + "# average amount of transaction in that week\n", + "avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n", + " key=accountId,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n", + " agg_func=\"AVG\",\n", + " window=\"7d\"))\n", + "\n", + "# number of transaction that took place in a day\n", + "num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=WindowAggTransformation(agg_expr=\"transactionID\",\n", + " agg_func=\"COUNT\",\n", + " window=\"1d\"))\n", + "\n", + "# Amount of transaction that took place in a day\n", + "total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n", + " key=accountId,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n", + " agg_func=\"SUM\",\n", + " window=\"1d\"))\n", + "\n", + "# average time of transaction in the past week\n", + "avg_transaction_time = Feature(name=\"avg_transaction_time\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n", + " agg_func=\"AVG\",\n", + " window=\"7d\")) \n", + "\n", + "# total number of currency used for transaction in the past week\n", + "num_currency_type_in_week = 
Feature(name=\"num_currency_type_in_week\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n", + " agg_func=\"COUNT\",\n", + " window=\"7d\"))\n", + "\n", + "# number of different ip address used for transaction in the past week\n", + "num_ip_address_count = Feature(name=\"num_ip_address_count\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n", + " agg_func=\"COUNT\",\n", + " window=\"7d\"))\n", + "\n", + "# number of devices used for the transaction in the past week\n", + "num_device_count = Feature(name=\"num_device_count\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n", + " agg_func=\"COUNT\",\n", + " window=\"7d\"))\n", + "\n", + "# find the time of most recent transaction\n", + "time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n", + " agg_func=\"LATEST\",\n", + " window=\"7d\"))\n", + "\n", + "features = [\n", + " avg_transaction_amount,\n", + " avg_transaction_time,\n", + " total_transaction_amount_in_day,\n", + " num_trasaction_count_in_day,\n", + " num_currency_type_in_week,\n", + " num_ip_address_count,\n", + " num_device_count,\n", + " time_most_recent_transaction\n", + "]\n", + "\n", + "aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n", + " source=transactions_aggr,\n", + " features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "17cc5132-461f-4d3d-b517-1f7e69d23252", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Derived Features\n", + "- `DerivedFeature`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7ac10ce4-e222-469c-bb2e-1658b45e3eda", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# derived features\n", + "feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n", + " key=accountId,\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " transaction_amount, avg_transaction_amount],\n", + " transform=\"transaction_amount - avg_transaction_amount\")\n", + "\n", + "feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n", + " key=accountId,\n", + " feature_type=INT32,\n", + " input_features=[\n", + " transaction_time, time_most_recent_transaction],\n", + " transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a9ec8416-9ac6-4499-b60f-55822265b893", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Build Defined Features\n", + "- `build_features`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "d9d32d4f-2b60-4978-bb87-c7d2160e98eb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n", + " derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "fa9e53b9-e7d4-4b25-b486-dc9e6801369a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Get Offline Features\n", + "- `FeatureQuery`\n", + "- `ObservationSettings`\n", + "- 
`get_offline_features`\n", + "- `feathr_spark_launcher.download_result`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b6340f2f-79dc-442b-a202-b2f2078a62ac", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "if client.spark_runtime == 'databricks':\n", + " output_path = 'dbfs:/feathrfrauddetection_test.avro'\n", + "else:\n", + " output_path = feathr_output_path\n", + "\n", + "feature_query = FeatureQuery(\n", + " feature_list=[\"account_country\",\n", + " \"transaction_time\",\n", + " \"num_currency_type_in_week\",\n", + " \"num_trasaction_count_in_day\",\n", + " \"total_transaction_amount_in_day\",\n", + " \"fraud_status\",\n", + " \"is_user_registered\",\n", + " \"avg_transaction_amount\",\n", + " \"num_ip_address_count\",\n", + " \"num_device_count\",\n", + " \"time_most_recent_transaction\",\n", + " \"feature_diff_current_and_avg_amount\",\n", + " \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n", + " \n", + "settings = ObservationSettings(\n", + " observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n", + " event_timestamp_column=\"transactionDate\",\n", + " timestamp_format=\"yyyyMMdd\")\n", + " \n", + "client.get_offline_features(observation_settings=settings,\n", + " feature_query=feature_query,\n", + " output_path=output_path)\n", + "client.wait_job_to_finish(timeout_sec=10000000000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5b7603ee-0c81-49ed-8e1f-53161ae57cbf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pandavro as pdx\n", + "import glob\n", + "from pathlib import Path\n", + "import matplotlib.pyplot as plt\n", + "from datetime import datetime, timedelta\n", + "\n", 
+ "from feathr import BackfillTime, MaterializationSettings, RedisSink" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "997db6eb-c7d8-4f5e-b6e0-09733ff706b7", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", + " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", + " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", + " tmp_dir = tempfile.TemporaryDirectory()\n", + " client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", + " dataframe_list = []\n", + " # assuming the result are in avro format\n", + " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", + " dataframe_list.append(pdx.read_avro(file))\n", + " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", + " tmp_dir.cleanup()\n", + " return vertical_concat_df\n", + "\n", + "df_res = get_result_df(client)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7fff1ac7-90d1-469b-a54c-397904417796", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Feature Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e482625e-2ecd-45cb-9d43-5baacd445006", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "filepath = Path('./result_out.csv')\n", + "df_res.to_csv(filepath, index=False) \n", + "df_res.reset_index()\n", + "df_res" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b4f86c53-16cf-4836-969b-7c34f0922057", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Train Fraud Detection Model with 
Calculated Features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0d9d06b8-01e7-4772-8734-6ebfe1996b03", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import train_test_split \n", + "import seaborn as sns\n", + "\n", + "# Build a new frame instead of dropping columns in place, so df_res stays intact\n", + "# and this cell gives the same result every time it is re-run.\n", + "final_df = (\n", + "    df_res\n", + "    .drop(columns=['accountID', 'transactionDate', 'account_country'], errors='ignore')\n", + "    .fillna(0)\n", + ")\n", + "\n", + "x_train, x_test, y_train, y_test = train_test_split(final_df.drop(columns=[\"fraud_status\"]),\n", + "                                                    final_df[\"fraud_status\"],\n", + "                                                    test_size=0.20,\n", + "                                                    random_state=0)\n", + "\n", + "# Maps each neighbor count k to [training score, test score].\n", + "scores = {}\n", + "\n", + "for k in range(2, 21):\n", + "    clf = KNeighborsClassifier(n_neighbors=k)\n", + "    clf.fit(x_train, y_train)\n", + "    scores[k] = [clf.score(x_train, y_train), clf.score(x_test, y_test)]\n", + "\n", + "for k, k_scores in scores.items():\n", + "    print(k, ':', k_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "83e69f23-aa4e-4893-8907-6d5f0792c23f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Materialize Features in Redis\n", + "- `BackfillTime`\n", + "- `RedisSink`\n", + "- `materialize_features`\n", + "- `multi_get_online_features`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "faad23c1-d827-4674-b630-83530574c27d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "backfill_time = BackfillTime(start=datetime(\n", + " 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n", + "settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"fraud_status\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=5000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n", + " \"fraud_status\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9c3b2403-95d6-44a1-b536-d2088608ff58", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n", + " \"fraud_status\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "71ba8699-3c42-4f73-be59-95b29f468696", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Register Features with Registry APIs\n", + "- `register_features`\n", + "- `list_registered_features`\n", + "- Above queries are send to a Standard Registry API Service (both `Purview` and `SQL` backend are supported)\n", + "- More friendly interface with detailed lineage 
can be found in: [Feathr UI](https://feathr-sql-registry.azurewebsites.net/)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c5028dd9-01ed-4394-a5c7-623e674125f6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "client.register_features()\n", + "client.list_registered_features(project_name=\"fraud_detection_test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "cb814ce7-72b9-4622-8518-106d4acf9008", + "showTitle": false, + "title": "" + } + }, + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "fraud_detection_feathr_test_2", + "notebookOrigID": 1891349682974490, + "widgets": {} + }, + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + }, + "kernelspec": { + "display_name": "Python 3.10.4 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file