From 18b015cdb41c0668326c3be6ee6a5639b52d97bc Mon Sep 17 00:00:00 2001 From: Curie Date: Wed, 27 Jul 2022 08:10:32 -0700 Subject: [PATCH 1/5] add new fraud detection sample --- .../data/feathr_user_workspace/fraud_detection_demo.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb new file mode 100644 index 000000000..82cf512b5 --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model with `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. 
`register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure-deployment-arm.md***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! pip install azure-cli\n","! pip install git+https://github.com/linkedin/feathr.git#subdirectory=feathr_project"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import 
SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9927c62c-7393-4ccd-a6dc-631084134386","showTitle":false,"title":""}},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = 
DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 
'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- 
`FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- `DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," 
transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," 
key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," 
transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," 
source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- `DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline 
Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- `feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," \"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, 
RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import 
KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," 
feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," \"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- The above queries are sent to a standard Registry API service (both `Purview` and `SQL` backends are supported)\n","- A friendlier interface with detailed lineage can be found in the [Feathr 
UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} From 3d93628293fff18464c62224b537a2792bb00187 Mon Sep 17 00:00:00 2001 From: Curie Date: Wed, 27 Jul 2022 08:12:32 -0700 Subject: [PATCH 2/5] modify library path --- .../data/feathr_user_workspace/fraud_detection_demo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb index 82cf512b5..4de0e304e 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb +++ b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb @@ -1 +1 @@ 
-{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model with `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. `register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure-deployment-arm.md***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! pip install azure-cli\n","! 
pip install git+https://github.com/linkedin/feathr.git#subdirectory=feathr_project"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9927c62c-7393-4ccd-a6dc-631084134386","showTitle":false,"title":""}},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. 
Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = 
f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: 
wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- `FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- `DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each 
method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," 
agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- `DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, 
avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- `feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," 
\"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," 
vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = 
clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," 
\"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- Above queries are sent to a Standard Registry API Service (both `Purview` and `SQL` backends are supported)\n","- More friendly interface with detailed lineage can be found in: [Feathr UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store 
to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model with `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. `register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure-deployment-arm.md***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! 
pip install feathr azure-cli"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9927c62c-7393-4ccd-a6dc-631084134386","showTitle":false,"title":""}},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. 
Run the following commands in the Cloud Shell to grant yourself access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = 
f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: 
wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- `FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- `DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each 
method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# transaction data source for the windowed aggregation features below\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average transaction amount over the past 7 days\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transactions in the past day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# total transaction amount in the past day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average transaction time over the past 7 days\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# number of currency codes used for transactions in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n","
agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- `DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, 
avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- `feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," 
\"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," 
vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = 
clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," 
\"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- The above queries are sent to a standard Registry API service (both `Purview` and `SQL` backends are supported)\n","- A friendlier interface with detailed lineage can be found in the [Feathr UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} From d5a6e59cdc9c45a2cff876888b3a624f5e0089ee Mon Sep 17 00:00:00 2001 From: Curie Date: Wed, 27 Jul 2022 11:31:24 -0700 Subject: [PATCH 3/5] fix link for arm deployment --- .../data/feathr_user_workspace/fraud_detection_demo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb index 4de0e304e..129227003 100644 --- a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb +++ b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and trasaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model wih `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. 
`register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure-deployment-arm.md***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! pip install feathr azure-cli"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! 
az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9927c62c-7393-4ccd-a6dc-631084134386","showTitle":false,"title":""}},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = 
client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," 
jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- 
`FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- `DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," 
transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," 
key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," 
transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," 
source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- `DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline 
Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- `feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," \"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, 
RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import 
KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," 
feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," \"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- The above queries are sent to a standard Registry API service (both `Purview` and `SQL` backends are supported)\n","- A friendlier interface with detailed lineage can be found in: [Feathr 
UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} +{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. 
`build_features` and `get_offline_features` \n","5. Train Fraud Detection Model with `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. `register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! pip install feathr azure-cli"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import 
SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, 
credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," 
s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- 
`FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- `DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," 
transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," 
key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," 
transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," 
source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- `DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline 
Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- `feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," \"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, 
RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import 
KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," 
feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," \"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- Above queries are send to a Standard Registry API Service (both `Purview` and `SQL` backend are supported)\n","- More friendly interface with detailed lineage can be found in: [Feathr 
UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} From 2bb14944f84cdaf594d9529190deecae13e91311 Mon Sep 17 00:00:00 2001 From: Curie Date: Wed, 27 Jul 2022 16:08:04 -0700 Subject: [PATCH 4/5] add synapse permission and move notebook to samples --- docs/samples/fraud_detection_demo.ipynb | 1 + .../data/feathr_user_workspace/fraud_detection_demo.ipynb | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 docs/samples/fraud_detection_demo.ipynb delete mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb diff --git a/docs/samples/fraud_detection_demo.ipynb b/docs/samples/fraud_detection_demo.ipynb new file mode 100644 index 000000000..cdacbc88b --- /dev/null +++ b/docs/samples/fraud_detection_demo.ipynb @@ -0,0 +1 @@ 
+{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model with `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. `register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! 
pip install feathr azure-cli"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. 
Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","synapse_workspace_name=\"${resource_prefix}syws\"\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","az synapse role assignment create --workspace-name $synapse_workspace_name --role \"Synapse Contributor\" --assignee $userId\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = 
f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 
'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- `FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- 
`DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["# Refer to the Feathr documentation to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# Refer to the Feathr documentation to learn more about the details of each method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," 
features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for 
transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- 
`DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- 
`feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," \"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, 
RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import 
KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," 
feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," \"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- The queries above are sent to a standard Registry API service (both `Purview` and `SQL` backends are supported)\n","- A friendlier interface with detailed lineage can be found in: [Feathr 
UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb deleted file mode 100644 index 129227003..000000000 --- a/feathr_project/feathrcli/data/feathr_user_workspace/fraud_detection_demo.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and trasaction data. 
All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model wih `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. `register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! 
pip install feathr azure-cli"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. 
Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = 
f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: 
wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- `FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- `DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each 
method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," 
agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- `DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, 
avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- `feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," 
\"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," 
vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = 
clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," 
\"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- Above queries are send to a Standard Registry API Service (both `Purview` and `SQL` backend are supported)\n","- More friendly interface with detailed lineage can be found in: [Feathr UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} From b36ef3cbcc8aca4c8059417d5a6b1bfaa8568b0b Mon Sep 17 00:00:00 2001 From: Curie Date: Thu, 28 Jul 2022 08:00:56 -0700 Subject: [PATCH 5/5] fix formatting of the demo notebook --- docs/samples/fraud_detection_demo.ipynb | 1026 ++++++++++++++++++++++- 1 file changed, 1025 
insertions(+), 1 deletion(-) diff --git a/docs/samples/fraud_detection_demo.ipynb b/docs/samples/fraud_detection_demo.ipynb index cdacbc88b..1408de700 100644 --- a/docs/samples/fraud_detection_demo.ipynb +++ b/docs/samples/fraud_detection_demo.ipynb @@ -1 +1,1025 @@ -{"cells":[{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7b19a0cd-31da-45b7-91a4-9cd561f3d3d8","showTitle":false,"title":""}},"source":["# Feathr Fraud Detection Sample\n","\n","This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and trasaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n","\n","\n","In the following Notebook, we \n","1. Install the latest Feathr code (to include some unreleased features) \n","2. Define Environment Variables & `yaml_config` Settings \n","3. Create `FeathrClient` and Define `FeatureAnchor`\n","4. `build_features` and `get_offline_features` \n","5. Train Fraud Detection Model wih `KNeighborsClassifier`\n","6. `materialize_features` and `multi_get_online_features`\n","7. `register_features` and `list_registered_features`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0b51153e-40dd-43d5-9d3a-501534156e6d","showTitle":false,"title":""}},"source":["## Setup Feathr Developer Environment"]},{"cell_type":"markdown","metadata":{},"source":["***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html***"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b9c63dd5-304e-4797-a230-8fb753710dbc","showTitle":false,"title":""}},"outputs":[],"source":["! 
pip install feathr azure-cli"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"69222adf-1cb0-410b-b98d-e22877f358c0","showTitle":false,"title":""}},"outputs":[],"source":["import glob\n","import os\n","import tempfile\n","from datetime import datetime, timedelta\n","from math import sqrt\n","\n","from feathr import FeathrClient\n","from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n","from feathr import Feature, DerivedFeature, FeatureAnchor\n","from feathr import BackfillTime, MaterializationSettings\n","from feathr import FeatureQuery, ObservationSettings\n","from feathr import RedisSink\n","from feathr import INPUT_CONTEXT, HdfsSource\n","from feathr import WindowAggTransformation\n","from feathr import TypedKey\n","from sklearn.model_selection import train_test_split\n","from azure.identity import DefaultAzureCredential\n","from azure.keyvault.secrets import SecretClient"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c0299d67-1103-4aa4-ba57-300498ae2579","showTitle":false,"title":""}},"outputs":[],"source":["! az login --use-device-code"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"58291272-00e5-4fe3-99d6-f1b89726f692","showTitle":false,"title":""}},"outputs":[],"source":["# replace with your prefix\n","resource_prefix = "]},{"cell_type":"markdown","metadata":{},"source":["## Permission\n","To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. 
Run the following lines of command in the Cloud Shell in order to grant yourself the access.\n","\n","~~~ \n","userId=\n","resource_prefix=\n","synapse_workspace_name=\"${resource_prefix}syws\"\n","keyvault_name=\"${resource_prefix}kv\"\n","objectId=$(az ad user show --id $userId --query id -o tsv)\n","az keyvault update --name $keyvault_name --enable-rbac-authorization false\n","az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n","az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n","az synapse role assignment create --workspace-name $synapse_workspace_name --role \"Synapse Contributor\" --assignee $userId\n","~~~"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a8a70f27-d520-4d3c-bb8c-f364f84cb738","showTitle":false,"title":""}},"outputs":[],"source":["# Get all the required credentials from Azure Key Vault\n","key_vault_name=resource_prefix+\"kv\"\n","synapse_workspace_url=resource_prefix+\"syws\"\n","adls_account=resource_prefix+\"dls\"\n","adls_fs_name=resource_prefix+\"fs\"\n","purview_name=resource_prefix+\"purview\"\n","key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n","credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n","client = SecretClient(vault_url=key_vault_uri, credential=credential)\n","secretName = \"FEATHR-ONLINE-STORE-CONN\"\n","retrieved_secret = client.get_secret(secretName).value\n","\n","# Get redis credentials; This is to parse Redis connection string.\n","redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n","redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n","redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n","redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n","\n","# Set the resource link\n","os.environ['spark_config__azure_synapse__dev_url'] = 
f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n","os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n","os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","os.environ['online_store__redis__host'] = redis_host\n","os.environ['online_store__redis__port'] = redis_port\n","os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n","os.environ['REDIS_PASSWORD']=redis_password\n","os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n","feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"50b2f73e-6380-42c3-91e8-4f3e15bc10d6","showTitle":false,"title":""}},"outputs":[],"source":["import tempfile\n","yaml_config = \"\"\"\n","# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n","api_version: 1\n","project_config:\n"," project_name: 'fraud_detection_test'\n"," required_environment_variables:\n"," - 'REDIS_PASSWORD'\n","offline_store:\n"," adls:\n"," adls_enabled: true\n"," wasb:\n"," wasb_enabled: true\n"," s3:\n"," s3_enabled: false\n"," s3_endpoint: 's3.amazonaws.com'\n"," jdbc:\n"," jdbc_enabled: false\n"," jdbc_database: 'feathrtestdb'\n"," jdbc_table: 'feathrtesttable'\n"," snowflake:\n"," url: \"dqllago-ol19457.snowflakecomputing.com\"\n"," user: \"feathrintegration\"\n"," role: \"ACCOUNTADMIN\"\n","spark_config:\n"," spark_cluster: 'azure_synapse'\n"," spark_result_output_parts: '1'\n"," azure_synapse:\n"," dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n"," pool_name: 'spark3'\n"," workspace_dir: 
'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n"," executor_size: 'Small'\n"," executor_num: 4\n"," feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n"," databricks:\n"," workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n"," config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n"," work_dir: 'dbfs:/fraud_detection_test'\n"," feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n","online_store:\n"," redis:\n"," host: 'feathrazuretest3redis.redis.cache.windows.net'\n"," port: 6380\n"," ssl_enabled: True\n","feature_registry:\n"," api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n","\"\"\"\n","tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n","with open(tmp.name, \"w\") as text_file:\n"," text_file.write(yaml_config)\n"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"eab0957c-c906-4297-a729-8dd8d79cb629","showTitle":false,"title":""}},"source":["## Initialize `Feathr Client`\n","- `FeathrClient`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"3734eee3-12f9-44db-a440-ad375ef859f0","showTitle":false,"title":""}},"outputs":[],"source":["client = FeathrClient(config_path=tmp.name)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"f6adbca1-5642-4ac1-bff7-e7c9d4d9e5b2","showTitle":false,"title":""}},"source":["## Define Features\n","- `HdfsSource`\n","- `TypedKey`\n","- `Feature`\n","- `FeatureAnchor`\n","- 
`DerivedFeature`"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b073b509-0f95-4e23-b16b-ffd8190fb6a2","showTitle":false,"title":""}},"source":["### Account Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b3668eeb-e4a0-4327-baf6-5521c856f51d","showTitle":false,"title":""}},"outputs":[],"source":["#Refer to to learn more about the details of each method\n","account_info = HdfsSource(name=\"AccountData\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","accountId = TypedKey(key_column=\"accountID\",\n"," key_column_type=ValueType.INT32,\n"," description=\"account id\")\n","\n","account_country = Feature(name=\"account_country\",\n"," key=accountId,\n"," feature_type=STRING, \n"," transform=\"accountCountry\")\n","\n","is_user_registered = Feature(name=\"is_user_registered\",\n"," key=accountId,\n"," feature_type=BOOLEAN,\n"," transform=\"isUserRegistered==TRUE\")\n","\n","num_payment_rejects_1d_per_user = Feature(name=\"num_payment_rejects_1d_per_user\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"numPaymentRejects1dPerUser\")\n","\n","account_age = Feature(name=\"account_age\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"accountAge\")\n"," \n","features = [\n"," account_country,\n"," account_age,\n"," is_user_registered,\n"," num_payment_rejects_1d_per_user\n","]\n","\n","account_anchor = FeatureAnchor(name=\"account_features\",\n"," source=account_info,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"6f12c07e-4faf-4411-8acd-6f5d13b962f8","showTitle":false,"title":""}},"source":["### Transaction 
Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"280062b9-ae21-4a1a-ae94-86a5c17fd589","showTitle":false,"title":""}},"outputs":[],"source":["# # #Refer to to learn more about the details of each method\n","\n","transaction_data = HdfsSource(name=\"transaction_data\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","transaction_id = Feature(name=\"transaction_id\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionID\")\n","\n","transaction_currency_code = Feature(name=\"transaction_currency_code\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"transactionCurrencyCode\")\n"," \n","transaction_amount = Feature(name=\"transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionAmount\")\n","\n","transaction_device_id = Feature(name=\"transaction_device_id\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionDeviceId\")\n","\n","transaction_ip_address = Feature(name=\"transaction_ip_address\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=\"transactionIPaddress\")\n","\n","transaction_time = Feature(name=\"transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=\"transactionTime\")\n","\n","fraud_status = Feature(name=\"fraud_status\",\n"," key=accountId,\n"," feature_type=STRING,\n"," transform=\"fraud_tag\")\n","\n","features = [\n"," transaction_id,\n"," transaction_amount,\n"," transaction_device_id,\n"," transaction_ip_address,\n"," transaction_time,\n"," transaction_currency_code,\n"," fraud_status\n","]\n","\n","transaction_feature_anchor = FeatureAnchor(name=\"transaction_features\",\n"," source=transaction_data,\n"," 
features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"86ac05e1-26bb-4820-87ea-f547e3561181","showTitle":false,"title":""}},"source":["### Transaction Aggregated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"4c969554-f690-42f5-b70a-d962bf558b03","showTitle":false,"title":""}},"outputs":[],"source":["# average amount of transaction in the past week\n","transactions_aggr = HdfsSource(name=\"transactions_aggr\",\n"," path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n","\n","# average amount of transaction in that week\n","avg_transaction_amount = Feature(name=\"avg_transaction_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\"))\n","\n","# number of transaction that took place in a day\n","num_trasaction_count_in_day = Feature(name=\"num_trasaction_count_in_day\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionID\",\n"," agg_func=\"COUNT\",\n"," window=\"1d\"))\n","\n","# Amount of transaction that took place in a day\n","total_transaction_amount_in_day = Feature(name=\"total_transaction_amount_in_day\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionAmount)\",\n"," agg_func=\"SUM\",\n"," window=\"1d\"))\n","\n","# average time of transaction in the past week\n","avg_transaction_time = Feature(name=\"avg_transaction_time\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"cast_float(transactionTime)\",\n"," agg_func=\"AVG\",\n"," window=\"7d\")) \n","\n","# total number of currency used for 
transaction in the past week\n","num_currency_type_in_week = Feature(name=\"num_currency_type_in_week\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionCurrencyCode\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of different ip address used for transaction in the past week\n","num_ip_address_count = Feature(name=\"num_ip_address_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionIPaddress\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# number of devices used for the transaction in the past week\n","num_device_count = Feature(name=\"num_device_count\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionDeviceId\",\n"," agg_func=\"COUNT\",\n"," window=\"7d\"))\n","\n","# find the time of most recent transaction\n","time_most_recent_transaction = Feature(name=\"time_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," transform=WindowAggTransformation(agg_expr=\"transactionTime\",\n"," agg_func=\"LATEST\",\n"," window=\"7d\"))\n","\n","features = [\n"," avg_transaction_amount,\n"," avg_transaction_time,\n"," total_transaction_amount_in_day,\n"," num_trasaction_count_in_day,\n"," num_currency_type_in_week,\n"," num_ip_address_count,\n"," num_device_count,\n"," time_most_recent_transaction\n","]\n","\n","aggr_anchor = FeatureAnchor(name=\"transaction_aggr_features\",\n"," source=transactions_aggr,\n"," features=features)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"17cc5132-461f-4d3d-b517-1f7e69d23252","showTitle":false,"title":""}},"source":["### Derived Features\n","- 
`DerivedFeature`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7ac10ce4-e222-469c-bb2e-1658b45e3eda","showTitle":false,"title":""}},"outputs":[],"source":["# derived features\n","feature_diff_current_and_avg_amount = DerivedFeature(name=\"feature_diff_current_and_avg_amount\",\n"," key=accountId,\n"," feature_type=FLOAT,\n"," input_features=[\n"," transaction_amount, avg_transaction_amount],\n"," transform=\"transaction_amount - avg_transaction_amount\")\n","\n","feature_time_pass_after_most_recent_transaction = DerivedFeature(name=\"feature_time_pass_after_most_recent_transaction\",\n"," key=accountId,\n"," feature_type=INT32,\n"," input_features=[\n"," transaction_time, time_most_recent_transaction],\n"," transform=\"cast_int(transaction_time) - cast_int(time_most_recent_transaction)\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"a9ec8416-9ac6-4499-b60f-55822265b893","showTitle":false,"title":""}},"source":["## Build Defined Features\n","- `build_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"d9d32d4f-2b60-4978-bb87-c7d2160e98eb","showTitle":false,"title":""}},"outputs":[],"source":["client.build_features(anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor], \n"," derived_feature_list=[feature_time_pass_after_most_recent_transaction, feature_diff_current_and_avg_amount])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"fa9e53b9-e7d4-4b25-b486-dc9e6801369a","showTitle":false,"title":""}},"source":["## Get Offline Features\n","- `FeatureQuery`\n","- `ObservationSettings`\n","- `get_offline_features`\n","- 
`feathr_spark_launcher.download_result`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b6340f2f-79dc-442b-a202-b2f2078a62ac","showTitle":false,"title":""}},"outputs":[],"source":["if client.spark_runtime == 'databricks':\n"," output_path = 'dbfs:/feathrfrauddetection_test.avro'\n","else:\n"," output_path = feathr_output_path\n","\n","feature_query = FeatureQuery(\n"," feature_list=[\"account_country\",\n"," \"transaction_time\",\n"," \"num_currency_type_in_week\",\n"," \"num_trasaction_count_in_day\",\n"," \"total_transaction_amount_in_day\",\n"," \"fraud_status\",\n"," \"is_user_registered\",\n"," \"avg_transaction_amount\",\n"," \"num_ip_address_count\",\n"," \"num_device_count\",\n"," \"time_most_recent_transaction\",\n"," \"feature_diff_current_and_avg_amount\",\n"," \"feature_time_pass_after_most_recent_transaction\"], key=accountId)\n"," \n","settings = ObservationSettings(\n"," observation_path=\"wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv\",\n"," event_timestamp_column=\"transactionDate\",\n"," timestamp_format=\"yyyyMMdd\")\n"," \n","client.get_offline_features(observation_settings=settings,\n"," feature_query=feature_query,\n"," output_path=output_path)\n","client.wait_job_to_finish(timeout_sec=10000000000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"5b7603ee-0c81-49ed-8e1f-53161ae57cbf","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import pandavro as pdx\n","import glob\n","from pathlib import Path\n","import matplotlib.pyplot as plt\n","from datetime import datetime, timedelta\n","\n","from feathr import BackfillTime, MaterializationSettings, 
RedisSink"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"997db6eb-c7d8-4f5e-b6e0-09733ff706b7","showTitle":false,"title":""}},"outputs":[],"source":["def get_result_df(client: FeathrClient) -> pd.DataFrame:\n"," \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n"," res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n"," tmp_dir = tempfile.TemporaryDirectory()\n"," client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n"," dataframe_list = []\n"," # assuming the result are in avro format\n"," for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n"," dataframe_list.append(pdx.read_avro(file))\n"," vertical_concat_df = pd.concat(dataframe_list, axis=0)\n"," tmp_dir.cleanup()\n"," return vertical_concat_df\n","\n","df_res = get_result_df(client)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"7fff1ac7-90d1-469b-a54c-397904417796","showTitle":false,"title":""}},"source":["## Feature Visualization"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"e482625e-2ecd-45cb-9d43-5baacd445006","showTitle":false,"title":""}},"outputs":[],"source":["filepath = Path('./result_out.csv')\n","df_res.to_csv(filepath, index=False) \n","df_res.reset_index()\n","df_res"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"b4f86c53-16cf-4836-969b-7c34f0922057","showTitle":false,"title":""}},"source":["## Train Fraud Detection Model with Calculated Features"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"0d9d06b8-01e7-4772-8734-6ebfe1996b03","showTitle":false,"title":""}},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","from sklearn.neighbors import 
KNeighborsClassifier\n","from sklearn.model_selection import train_test_split \n","import seaborn as sns\n","\n","final_df = df_res\n","final_df.drop(['accountID'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['transactionDate'], axis=1, inplace=True, errors='ignore')\n","final_df.drop(['account_country'], axis=1, inplace=True, errors='ignore')\n","final_df = final_df.fillna(0)\n","\n","x_train, x_test, y_train, y_test = train_test_split(final_df.drop([\"fraud_status\"], axis=1),\n"," final_df[\"fraud_status\"],\n"," test_size=0.20,\n"," random_state=0)\n"," \n","K = []\n","training = []\n","test = []\n","scores = {}\n"," \n","for k in range(2, 21):\n"," clf = KNeighborsClassifier(n_neighbors = k)\n"," clf.fit(x_train, y_train)\n"," \n"," training_score = clf.score(x_train, y_train)\n"," test_score = clf.score(x_test, y_test)\n"," K.append(k)\n"," \n"," training.append(training_score)\n"," test.append(test_score)\n"," scores[k] = [training_score, test_score]\n","\n","for keys, values in scores.items():\n"," print(keys, ':', values)"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"83e69f23-aa4e-4893-8907-6d5f0792c23f","showTitle":false,"title":""}},"source":["## Materialize Features in Redis\n","- `BackfillTime`\n","- `RedisSink`\n","- `materialize_features`\n","- `multi_get_online_features`"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"faad23c1-d827-4674-b630-83530574c27d","showTitle":false,"title":""}},"outputs":[],"source":["backfill_time = BackfillTime(start=datetime(\n"," 2013, 4, 7), end=datetime(2013, 4, 7), step=timedelta(days=1))\n","redisSink = RedisSink(table_name=\"fraudDetectionDemoFeature\")\n","settings = MaterializationSettings(\"fraudDetectionDemoFeature\",\n"," backfill_time=backfill_time,\n"," sinks=[redisSink],\n"," 
feature_names=[\"fraud_status\"])\n","\n","client.materialize_features(settings)\n","client.wait_job_to_finish(timeout_sec=5000)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"1f5b191f-b1e8-49e4-b54d-ffc2f8c0a0b8","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170'], [\n"," \"fraud_status\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"9c3b2403-95d6-44a1-b536-d2088608ff58","showTitle":false,"title":""}},"outputs":[],"source":["client.multi_get_online_features('fraudDetectionDemoFeature', ['1759222192247110', '914800996051170', '844428033864668'], [\n"," \"fraud_status\"])"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"71ba8699-3c42-4f73-be59-95b29f468696","showTitle":false,"title":""}},"source":["## Register Features with Registry APIs\n","- `register_features`\n","- `list_registered_features`\n","- Above queries are send to a Standard Registry API Service (both `Purview` and `SQL` backend are supported)\n","- More friendly interface with detailed lineage can be found in: [Feathr 
UI](https://feathr-sql-registry.azurewebsites.net/)"]},{"cell_type":"code","execution_count":null,"metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"c5028dd9-01ed-4394-a5c7-623e674125f6","showTitle":false,"title":""}},"outputs":[],"source":["client.register_features()\n","client.list_registered_features(project_name=\"fraud_detection_test\")"]},{"cell_type":"markdown","metadata":{"application/vnd.databricks.v1+cell":{"inputWidgets":{},"nuid":"cb814ce7-72b9-4622-8518-106d4acf9008","showTitle":false,"title":""}},"source":[]}],"metadata":{"application/vnd.databricks.v1+notebook":{"dashboards":[],"language":"python","notebookMetadata":{"pythonIndentUnit":4},"notebookName":"fraud_detection_feathr_test_2","notebookOrigID":1891349682974490,"widgets":{}},"interpreter":{"hash":"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"},"kernelspec":{"display_name":"Python 3.10.4 64-bit","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.4"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":0} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7b19a0cd-31da-45b7-91a4-9cd561f3d3d8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Fraud Detection Sample\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts the fraud status of transactions based on the user account data and transaction data. All the data that was used in the notebook can be found here: https://github.com/microsoft/r-server-fraud-detection\n", + "\n", + "\n", + "In the following Notebook, we \n", + "1. Install the latest Feathr code (to include some unreleased features) \n", + "2. 
Define Environment Variables & `yaml_config` Settings \n", + "3. Create `FeathrClient` and Define `FeatureAnchor`\n", + "4. `build_features` and `get_offline_features` \n", + "5. Train Fraud Detection Model with `KNeighborsClassifier`\n", + "6. `materialize_features` and `multi_get_online_features`\n", + "7. `register_features` and `list_registered_features`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0b51153e-40dd-43d5-9d3a-501534156e6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup Feathr Developer Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***Prior to running the notebook, if you have not deployed all the required resources, please refer to the guide here and follow the steps to do so: https://linkedin.github.io/feathr/how-to-guides/azure-deployment-arm.html***" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b9c63dd5-304e-4797-a230-8fb753710dbc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "! 
pip install feathr azure-cli" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "69222adf-1cb0-410b-b98d-e22877f358c0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "from feathr import FeathrClient\n", + "from feathr import STRING, BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c0299d67-1103-4aa4-ba57-300498ae2579", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "! az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "58291272-00e5-4fe3-99d6-f1b89726f692", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# replace with your prefix\n", + "resource_prefix = " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Permission\n", + "To run the cells below, you need additional permission: permission to your managed identity to access the keyvault, and permission to the user to access the Storage Blob. 
Run the following commands in the Cloud Shell to grant yourself access.\n", + "\n", + "~~~ \n", + "userId=\n", + "resource_prefix=\n", + "synapse_workspace_name=\"${resource_prefix}syws\"\n", + "keyvault_name=\"${resource_prefix}kv\"\n", + "objectId=$(az ad user show --id $userId --query id -o tsv)\n", + "az keyvault update --name $keyvault_name --enable-rbac-authorization false\n", + "az keyvault set-policy -n $keyvault_name --secret-permissions get list --object-id $objectId\n", + "az role assignment create --assignee $userId --role \"Storage Blob Data Contributor\"\n", + "az synapse role assignment create --workspace-name $synapse_workspace_name --role \"Synapse Contributor\" --assignee $userId\n", + "~~~" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a8a70f27-d520-4d3c-bb8c-f364f84cb738", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "key_vault_name=resource_prefix+\"kv\"\n", + "synapse_workspace_url=resource_prefix+\"syws\"\n", + "adls_account=resource_prefix+\"dls\"\n", + "adls_fs_name=resource_prefix+\"fs\"\n", + "purview_name=resource_prefix+\"purview\"\n", + "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", + "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", + "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", + "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", + "retrieved_secret = client.get_secret(secretName).value\n", + "\n", + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", + "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", + "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", +
"redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", + "\n", + "# Set the resource link\n", + "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", + "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", + "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password\n", + "os.environ['feature_registry__purview__purview_name'] = f'{purview_name}'\n", + "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "50b2f73e-6380-42c3-91e8-4f3e15bc10d6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'fraud_detection_test'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + "offline_store:\n", + " adls:\n", + " adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: 's3.amazonaws.com'\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: 'feathrtestdb'\n", + " jdbc_table: 'feathrtesttable'\n", + " snowflake:\n", + " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", + " user: 
\"feathrintegration\"\n", + " role: \"ACCOUNTADMIN\"\n", + "spark_config:\n", + " spark_cluster: 'azure_synapse'\n", + " spark_result_output_parts: '1'\n", + " azure_synapse:\n", + " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", + " pool_name: 'spark3'\n", + " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/fraud_detection_test'\n", + " executor_size: 'Small'\n", + " executor_num: 4\n", + " feathr_runtime_location: wasbs://public@azurefeathrstorage.blob.core.windows.net/feathr-assembly-LATEST.jar\n", + " databricks:\n", + " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", + " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", + " work_dir: 'dbfs:/fraud_detection_test'\n", + " feathr_runtime_location: https://azurefeathrstorage.blob.core.windows.net/public/feathr-assembly-LATEST.jar\n", + "online_store:\n", + " redis:\n", + " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", + " port: 6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n", + "\"\"\"\n", + "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", + "with open(tmp.name, \"w\") as text_file:\n", + " text_file.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "eab0957c-c906-4297-a729-8dd8d79cb629", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Initialize `Feathr Client`\n", + "- `FeathrClient`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3734eee3-12f9-44db-a440-ad375ef859f0", + "showTitle": false, + "title": 
# %%
# Build the Feathr client from the YAML configuration file written to `tmp`.
client = FeathrClient(config_path=tmp.name)

# %% [markdown]
# ## Define Features
# - `HdfsSource`
# - `TypedKey`
# - `Feature`
# - `FeatureAnchor`
# - `DerivedFeature`

# %% [markdown]
# ### Account Features

# %%
# Refer to the Feathr documentation to learn more about each class used here.
account_info = HdfsSource(
    name="AccountData",
    path="wasbs://frauddata@feathrdatastorage.blob.core.windows.net/account_out_small.csv",
    event_timestamp_column="transactionDate",
    timestamp_format="yyyyMMdd",
)

# Every feature in this notebook is keyed by this account id.
# NOTE(review): the account ids looked up in the online store further down in
# this notebook are 15-16 digit values, which do not fit in a 32-bit integer.
# Confirm whether INT32 is really the intended key type.
accountId = TypedKey(
    key_column="accountID",
    key_column_type=ValueType.INT32,
    description="account id",
)

account_country = Feature(
    name="account_country",
    key=accountId,
    feature_type=STRING,
    transform="accountCountry",
)

is_user_registered = Feature(
    name="is_user_registered",
    key=accountId,
    feature_type=BOOLEAN,
    transform="isUserRegistered==TRUE",
)

num_payment_rejects_1d_per_user = Feature(
    name="num_payment_rejects_1d_per_user",
    key=accountId,
    feature_type=INT32,
    transform="numPaymentRejects1dPerUser",
)

account_age = Feature(
    name="account_age",
    key=accountId,
    feature_type=INT32,
    transform="accountAge",
)

# Each anchor gets its own list name so that re-running one cell can never
# hand another anchor the wrong feature list.
account_feature_list = [
    account_country,
    account_age,
    is_user_registered,
    num_payment_rejects_1d_per_user,
]

account_anchor = FeatureAnchor(
    name="account_features",
    source=account_info,
    features=account_feature_list,
)

# %% [markdown]
# ### Transaction Features

# %%
# Refer to the Feathr documentation to learn more about each class used here.
transaction_data = HdfsSource(
    name="transaction_data",
    path="wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv",
    event_timestamp_column="transactionDate",
    timestamp_format="yyyyMMdd",
)

transaction_id = Feature(
    name="transaction_id",
    key=accountId,
    feature_type=STRING,
    transform="transactionID",
)

transaction_currency_code = Feature(
    name="transaction_currency_code",
    key=accountId,
    feature_type=STRING,
    transform="transactionCurrencyCode",
)

transaction_amount = Feature(
    name="transaction_amount",
    key=accountId,
    feature_type=FLOAT,
    transform="transactionAmount",
)

# NOTE(review): device id and IP address are declared FLOAT; confirm that the
# source columns really hold numeric values rather than strings.
transaction_device_id = Feature(
    name="transaction_device_id",
    key=accountId,
    feature_type=FLOAT,
    transform="transactionDeviceId",
)

transaction_ip_address = Feature(
    name="transaction_ip_address",
    key=accountId,
    feature_type=FLOAT,
    transform="transactionIPaddress",
)

transaction_time = Feature(
    name="transaction_time",
    key=accountId,
    feature_type=INT32,
    transform="transactionTime",
)

# The label column used for model training later in the notebook.
fraud_status = Feature(
    name="fraud_status",
    key=accountId,
    feature_type=STRING,
    transform="fraud_tag",
)

transaction_feature_list = [
    transaction_id,
    transaction_amount,
    transaction_device_id,
    transaction_ip_address,
    transaction_time,
    transaction_currency_code,
    fraud_status,
]

transaction_feature_anchor = FeatureAnchor(
    name="transaction_features",
    source=transaction_data,
    features=transaction_feature_list,
)

# %% [markdown]
# ### Transaction Aggregated Features

# %%
# Same transaction file as above, registered as a separate source for the
# window-aggregated features.
transactions_aggr = HdfsSource(
    name="transactions_aggr",
    path="wasbs://frauddata@feathrdatastorage.blob.core.windows.net/transaction_out_small.csv",
    event_timestamp_column="transactionDate",
    timestamp_format="yyyyMMdd",
)

# Average transaction amount over a 7-day window.
avg_transaction_amount = Feature(
    name="avg_transaction_amount",
    key=accountId,
    feature_type=FLOAT,
    transform=WindowAggTransformation(
        agg_expr="cast_float(transactionAmount)",
        agg_func="AVG",
        window="7d",
    ),
)

# Number of transactions over a 1-day window.
# (The "trasaction" spelling is part of the registered feature name and is
# referenced by that exact string elsewhere, so it is kept as-is.)
num_trasaction_count_in_day = Feature(
    name="num_trasaction_count_in_day",
    key=accountId,
    feature_type=INT32,
    transform=WindowAggTransformation(
        agg_expr="transactionID",
        agg_func="COUNT",
        window="1d",
    ),
)

# Total transaction amount over a 1-day window.
total_transaction_amount_in_day = Feature(
    name="total_transaction_amount_in_day",
    key=accountId,
    feature_type=FLOAT,
    transform=WindowAggTransformation(
        agg_expr="cast_float(transactionAmount)",
        agg_func="SUM",
        window="1d",
    ),
)

# Average transaction time over a 7-day window.
# Declared FLOAT (was INT32): the aggregation is an AVG over a float cast, the
# same shape as avg_transaction_amount, so the result is generally fractional.
avg_transaction_time = Feature(
    name="avg_transaction_time",
    key=accountId,
    feature_type=FLOAT,
    transform=WindowAggTransformation(
        agg_expr="cast_float(transactionTime)",
        agg_func="AVG",
        window="7d",
    ),
)

# COUNT over the currency-code column in a 7-day window.
# NOTE(review): COUNT presumably counts rows, not distinct currencies, despite
# the feature name; confirm the intended semantics.
num_currency_type_in_week = Feature(
    name="num_currency_type_in_week",
    key=accountId,
    feature_type=INT32,
    transform=WindowAggTransformation(
        agg_expr="transactionCurrencyCode",
        agg_func="COUNT",
        window="7d",
    ),
)

# COUNT over the IP-address column in a 7-day window.
# NOTE(review): presumably a row count, not a count of distinct addresses.
num_ip_address_count = Feature(
    name="num_ip_address_count",
    key=accountId,
    feature_type=INT32,
    transform=WindowAggTransformation(
        agg_expr="transactionIPaddress",
        agg_func="COUNT",
        window="7d",
    ),
)

# COUNT over the device-id column in a 7-day window.
# NOTE(review): presumably a row count, not a count of distinct devices.
num_device_count = Feature(
    name="num_device_count",
    key=accountId,
    feature_type=INT32,
    transform=WindowAggTransformation(
        agg_expr="transactionDeviceId",
        agg_func="COUNT",
        window="7d",
    ),
)

# Time of the most recent transaction within a 7-day window.
time_most_recent_transaction = Feature(
    name="time_most_recent_transaction",
    key=accountId,
    feature_type=INT32,
    transform=WindowAggTransformation(
        agg_expr="transactionTime",
        agg_func="LATEST",
        window="7d",
    ),
)

aggregated_feature_list = [
    avg_transaction_amount,
    avg_transaction_time,
    total_transaction_amount_in_day,
    num_trasaction_count_in_day,
    num_currency_type_in_week,
    num_ip_address_count,
    num_device_count,
    time_most_recent_transaction,
]

aggr_anchor = FeatureAnchor(
    name="transaction_aggr_features",
    source=transactions_aggr,
    features=aggregated_feature_list,
)

# %% [markdown]
# ### Derived Features
# - `DerivedFeature`

# %%
# Derived features are computed from other features rather than from a source.

# Difference between the current transaction amount and the 7-day average.
feature_diff_current_and_avg_amount = DerivedFeature(
    name="feature_diff_current_and_avg_amount",
    key=accountId,
    feature_type=FLOAT,
    input_features=[transaction_amount, avg_transaction_amount],
    transform="transaction_amount - avg_transaction_amount",
)

# Difference between the current transaction time and the most recent one.
feature_time_pass_after_most_recent_transaction = DerivedFeature(
    name="feature_time_pass_after_most_recent_transaction",
    key=accountId,
    feature_type=INT32,
    input_features=[transaction_time, time_most_recent_transaction],
    transform="cast_int(transaction_time) - cast_int(time_most_recent_transaction)",
)
# %% [markdown]
# ## Build Defined Features
# - `build_features`

# %%
client.build_features(
    anchor_list=[account_anchor, transaction_feature_anchor, aggr_anchor],
    derived_feature_list=[
        feature_time_pass_after_most_recent_transaction,
        feature_diff_current_and_avg_amount,
    ],
)

# %% [markdown]
# ## Get Offline Features
# - `FeatureQuery`
# - `ObservationSettings`
# - `get_offline_features`
# - `feathr_spark_launcher.download_result`

# %%
if client.spark_runtime == 'databricks':
    output_path = 'dbfs:/feathrfrauddetection_test.avro'
else:
    # `feathr_output_path` is not defined in this part of the notebook;
    # presumably it is set in an earlier configuration cell.
    output_path = feathr_output_path

feature_query = FeatureQuery(
    feature_list=[
        "account_country",
        "transaction_time",
        "num_currency_type_in_week",
        "num_trasaction_count_in_day",
        "total_transaction_amount_in_day",
        "fraud_status",
        "is_user_registered",
        "avg_transaction_amount",
        "num_ip_address_count",
        "num_device_count",
        "time_most_recent_transaction",
        "feature_diff_current_and_avg_amount",
        "feature_time_pass_after_most_recent_transaction",
    ],
    key=accountId,
)

# Named distinctly from the materialization settings created further down, so
# the two objects never shadow each other when cells are re-run.
observation_settings = ObservationSettings(
    observation_path="wasbs://frauddata@feathrdatastorage.blob.core.windows.net/observation_out_small.csv",
    event_timestamp_column="transactionDate",
    timestamp_format="yyyyMMdd",
)

client.get_offline_features(
    observation_settings=observation_settings,
    feature_query=feature_query,
    output_path=output_path,
)
# The timeout value is kept from the original notebook; it is effectively unbounded.
client.wait_job_to_finish(timeout_sec=10_000_000_000)

# %%
# Imports for the remaining cells, gathered in one place.
import glob
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandavro as pdx
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from feathr import BackfillTime, MaterializationSettings, RedisSink

# %%
def get_result_df(client: FeathrClient) -> pd.DataFrame:
    """Download the job result dataset from cloud storage as a pandas DataFrame.

    The result files are downloaded into a temporary directory, every
    ``*.avro`` file found there is read, and the frames are stacked row-wise.

    Args:
        client: Feathr client whose most recent job produced the result.

    Returns:
        One DataFrame holding the rows of all downloaded ``*.avro`` files,
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If no ``*.avro`` file was downloaded.
    """
    res_url = client.get_job_result_uri(block=True, timeout_sec=600)
    # The context manager removes the temporary directory even when the
    # download or the Avro parsing raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir)
        # Assumes the results are in Avro format. Sorted so the row order does
        # not depend on the file system's listing order.
        avro_files = sorted(glob.glob(os.path.join(tmp_dir, '*.avro')))
        if not avro_files:
            raise ValueError(f"No .avro result files were downloaded from {res_url!r}")
        # ignore_index gives a clean index across part files; the frames are
        # fully loaded in memory before the directory is removed.
        return pd.concat(
            [pdx.read_avro(avro_file) for avro_file in avro_files],
            axis=0,
            ignore_index=True,
        )

# %%
df_res = get_result_df(client)

# %% [markdown]
# ## Feature Visualization

# %%
filepath = Path('./result_out.csv')
df_res.to_csv(filepath, index=False)
# Show a preview rather than the whole frame.
df_res.head()

# %% [markdown]
# ## Train Fraud Detection Model with Calculated Features

# %%
# Build the training frame without mutating `df_res`, so this cell can be
# re-run and `df_res` still shows what was exported above.
training_df = (
    df_res
    .drop(columns=['accountID', 'transactionDate', 'account_country'], errors='ignore')
    .fillna(0)
)

x_train, x_test, y_train, y_test = train_test_split(
    training_df.drop(columns=["fraud_status"]),
    training_df["fraud_status"],
    test_size=0.20,
    random_state=0,
)

# Maps k -> [score on the training split, score on the test split].
scores = {}
for k in range(2, 21):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(x_train, y_train)
    scores[k] = [clf.score(x_train, y_train), clf.score(x_test, y_test)]

for k, k_scores in scores.items():
    print(k, ':', k_scores)

# %% [markdown]
# ## Materialize Features in Redis
# - `BackfillTime`
# - `RedisSink`
# - `materialize_features`
# - `multi_get_online_features`

# %%
# Start and end are the same date, with a one-day step.
backfill_time = BackfillTime(
    start=datetime(2013, 4, 7),
    end=datetime(2013, 4, 7),
    step=timedelta(days=1),
)
redisSink = RedisSink(table_name="fraudDetectionDemoFeature")
materialization_settings = MaterializationSettings(
    "fraudDetectionDemoFeature",
    backfill_time=backfill_time,
    sinks=[redisSink],
    feature_names=["fraud_status"],
)

client.materialize_features(materialization_settings)
client.wait_job_to_finish(timeout_sec=5000)
# %%
# Read `fraud_status` back from the online store for two account ids.
online_table = 'fraudDetectionDemoFeature'
online_feature_names = ["fraud_status"]
two_account_ids = ['1759222192247110', '914800996051170']
client.multi_get_online_features(online_table, two_account_ids, online_feature_names)

# %%
# Same lookup for three account ids.
three_account_ids = ['1759222192247110', '914800996051170', '844428033864668']
client.multi_get_online_features('fraudDetectionDemoFeature', three_account_ids, ["fraud_status"])

# %% [markdown]
# ## Register Features with Registry APIs
# - `register_features`
# - `list_registered_features`
# - The queries above are sent to a Standard Registry API Service (both `Purview` and `SQL` backends are supported)
# - A friendlier interface with detailed lineage can be found in: [Feathr UI](https://feathr-sql-registry.azurewebsites.net/)

# %%
# Register the features built above, then list what the registry holds for the project.
client.register_features()
registry_project = "fraud_detection_test"
client.list_registered_features(project_name=registry_project)
"notebookOrigID": 1891349682974490, + "widgets": {} + }, + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + }, + "kernelspec": { + "display_name": "Python 3.10.4 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file