From b4f09d7e3e226b4c22553a676de1d74c82066e0f Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 6 Oct 2022 21:28:15 +0000 Subject: [PATCH 1/3] Fix local spark output file-format bug Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../spark_provider/_localspark_submission.py | 180 ++++++++++-------- 1 file changed, 96 insertions(+), 84 deletions(-) diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py index 3b24fd513..3bff6dfe4 100644 --- a/feathr_project/feathr/spark_provider/_localspark_submission.py +++ b/feathr_project/feathr/spark_provider/_localspark_submission.py @@ -1,41 +1,38 @@ -import time from datetime import datetime import json import os from pathlib import Path +from shlex import split +from subprocess import STDOUT, Popen +import time from typing import Dict, List, Optional -from feathr.spark_provider._abc import SparkJobLauncher from loguru import logger - from pyspark import * -from subprocess import TimeoutExpired, STDOUT, Popen -from shlex import split from feathr.constants import FEATHR_MAVEN_ARTIFACT - +from feathr.spark_provider._abc import SparkJobLauncher class _FeathrDLocalSparkJobLauncher(SparkJobLauncher): - """Class to interact with local Spark - This class is not intended to be used in Production environments. - It is intended to be used for testing and development purposes. - No authentication is required to use this class. - Args: - workspace_path (str): Path to the workspace + """Class to interact with local Spark. This class is not intended to be used in Production environments. + It is intended to be used for testing and development purposes. No authentication is required to use this class. + + Args: + workspace_path (str): Path to the workspace """ + def __init__( self, workspace_path: str, master: str = None, - debug_folder:str = "debug", - clean_up:bool = True, - retry:int = 3, - retry_sec:int = 5, + debug_folder: str = "debug", + clean_up: bool = True, + retry: int = 3, + retry_sec: int = 5, ): - """Initialize the Local Spark job launcher - """ - self.workspace_path = workspace_path, + """Initialize the Local Spark job launcher""" + self.workspace_path = (workspace_path,) self.debug_folder = debug_folder self.spark_job_num = 0 self.clean_up = clean_up @@ -48,82 +45,82 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): """For Local Spark Case, no need to upload to cloud workspace.""" return local_path_or_http_path - def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None, - python_files: List[str]= None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None): - """ - Submits the Feathr job to local spark, using subprocess args. - - reference files: put everything there and the function will automatically categorize them based on the - extension name to either the "files" argument in the Livy API, or the "jars" argument in the Livy API. The - path can be local path and this function will automatically upload the function to the corresponding azure - storage - - Also, note that the Spark application will automatically run on YARN cluster mode. You cannot change it if + def submit_feathr_job( + self, + job_name: str, + main_jar_path: str = None, + main_class_name: str = None, + arguments: List[str] = None, + python_files: List[str] = None, + configuration: Dict[str, str] = {}, + properties: Dict[str, str] = {}, + *_, + ): + """Submits the Feathr job to local spark, using subprocess args. + Note that the Spark application will automatically run on YARN cluster mode. You cannot change it if you are running with Azure Synapse. Args: job_name (str): name of the job main_jar_path (str): main file paths, usually your main jar file main_class_name (str): name of your main class - arguments (str): all the arguments you want to pass into the spark job - configuration (Dict[str, str]): Additional configs for the spark job + arguments (List[str]): all the arguments you want to pass into the spark job python_files (List[str]): required .zip, .egg, or .py files of spark job - properties (Dict[str, str]): Additional System Properties for the spark job - job_tags (str): not used in local spark mode + configuration (Dict[str, str]): Additional configs for the spark job + reference_files_path (str): not used in local spark mode + job_tags (str): not used in local spark mode """ - logger.warning(f"Local Spark Mode only support basic params right now and should be used only for testing purpose.") - self.cmd_file, self.log_path = self._get_debug_file_name(self.debug_folder, prefix = job_name) - args = self._init_args(master = self.master, job_name=job_name) + logger.warning( + f"Local Spark Mode only support basic params right now and should be used only for testing purpose." + ) + self.cmd_file, self.log_path = self._get_debug_file_name(self.debug_folder, prefix=job_name) - if properties: - arguments.extend(["--system-properties", json.dumps(properties)]) + # Get conf and package arguments + cfg = configuration.copy() if configuration else {} + maven_dependency = f"{cfg.pop('spark.jars.packages', self.packages)},{FEATHR_MAVEN_ARTIFACT}" + spark_args = self._init_args(master=self.master, job_name=job_name, confs=cfg) - if configuration: - cfg = configuration.copy() # We don't want to mess up input parameters - else: - cfg = {} - if not main_jar_path: # We don't have the main jar, use Maven - # Add Maven dependency to the job configuration - if "spark.jars.packages" in cfg: - cfg["spark.jars.packages"] = ",".join( - [cfg["spark.jars.packages"], FEATHR_MAVEN_ARTIFACT]) - else: - cfg["spark.jars.packages"] = ",".join([self.packages, FEATHR_MAVEN_ARTIFACT]) - if not python_files: # This is a JAR job # Azure Synapse/Livy doesn't allow JAR job starts from Maven directly, we must have a jar file uploaded. # so we have to use a dummy jar as the main file. logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") # Use the no-op jar as the main file - # This is a dummy jar which contains only one `org.example.Noop` class with one empty `main` function which does nothing + # This is a dummy jar which contains only one `org.example.Noop` class with one empty `main` function + # which does nothing current_dir = Path(__file__).parent.resolve() main_jar_path = os.path.join(current_dir, "noop-1.0.jar") - args.extend(["--packages", cfg["spark.jars.packages"],"--class", main_class_name, main_jar_path]) + spark_args.extend(["--packages", maven_dependency, "--class", main_class_name, main_jar_path]) else: - args.extend(["--packages", cfg["spark.jars.packages"]]) - # This is a PySpark job, no more things to + spark_args.extend(["--packages", maven_dependency]) + # This is a PySpark job, no more things to if python_files.__len__() > 1: - args.extend(["--py-files", ",".join(python_files[1:])]) + spark_args.extend(["--py-files", ",".join(python_files[1:])]) print(python_files) - args.append(python_files[0]) + spark_args.append(python_files[0]) else: - args.extend(["--class", main_class_name, main_jar_path]) + spark_args.extend(["--class", main_class_name, main_jar_path]) + + if arguments: + spark_args.extend(arguments) + + if properties: + spark_args.extend(["--system-properties", json.dumps(properties)]) - cmd = " ".join(args) + " " + " ".join(arguments) + cmd = " ".join(spark_args) - log_append = open(f"{self.log_path}_{self.spark_job_num}.txt" , "a") + log_append = open(f"{self.log_path}_{self.spark_job_num}.txt", "a") proc = Popen(split(cmd), shell=False, stdout=log_append, stderr=STDOUT) logger.info(f"Detail job stdout and stderr are in {self.log_path}.") self.spark_job_num += 1 with open(self.cmd_file, "a") as c: - c.write(" ".join(proc.args)) - c.write("\n") + c.write(" ".join(proc.args)) + c.write("\n") self.latest_spark_proc = proc @@ -132,9 +129,8 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas return proc def wait_for_completion(self, timeout_seconds: Optional[float] = 500) -> bool: - """ - this function track local spark job commands and process status. - files will be write into `debug` folder under your workspace. + """This function track local spark job commands and process status. + Files will be write into `debug` folder under your workspace. """ logger.info(f"{self.spark_job_num} local spark job(s) in this Launcher, only the latest will be monitored.") logger.info(f"Please check auto generated spark command in {self.cmd_file} and detail logs in {self.log_path}.") @@ -143,12 +139,15 @@ def wait_for_completion(self, timeout_seconds: Optional[float] = 500) -> bool: start_time = time.time() retry = self.retry - log_read = open(f"{self.log_path}_{self.spark_job_num-1}.txt" , "r") + log_read = open(f"{self.log_path}_{self.spark_job_num-1}.txt", "r") while proc.poll() is None and (((timeout_seconds is None) or (time.time() - start_time < timeout_seconds))): time.sleep(1) try: if retry < 1: - logger.warning(f"Spark job has hang for {self.retry * self.retry_sec} seconds. latest msg is {last_line}. please check {log_read.name}") + logger.warning( + f"Spark job has hang for {self.retry * self.retry_sec} seconds. latest msg is {last_line}. \ + Please check {log_read.name}" + ) if self.clean_up: self._clean_up() proc.wait() @@ -168,22 +167,28 @@ def wait_for_completion(self, timeout_seconds: Optional[float] = 500) -> bool: retry -= 1 job_duration = time.time() - start_time - log_read.close() + log_read.close() if proc.returncode == None: - logger.warning(f"Spark job with pid {self.latest_spark_proc.pid} not completed after {timeout_seconds} sec time out setting, please check.") + logger.warning( + f"Spark job with pid {self.latest_spark_proc.pid} not completed after {timeout_seconds} sec \ + time out setting. Please check." + ) if self.clean_up: self._clean_up() proc.wait() return True elif proc.returncode == 1: - logger.warning(f"Spark job with pid {self.latest_spark_proc.pid} is not successful, please check.") + logger.warning(f"Spark job with pid {self.latest_spark_proc.pid} is not successful. Please check.") return False else: - logger.info(f"Spark job with pid {self.latest_spark_proc.pid} finished in: {int(job_duration)} seconds with returncode {proc.returncode}") + logger.info( + f"Spark job with pid {self.latest_spark_proc.pid} finished in: {int(job_duration)} seconds \ + with returncode {proc.returncode}" + ) return True - def _clean_up(self, proc:Popen = None): + def _clean_up(self, proc: Popen = None): logger.warning(f"Terminate the spark job due to as clean_up is set to True.") if not proc: self.latest_spark_proc.terminate() @@ -194,30 +199,37 @@ def get_status(self) -> str: """Get the status of the job, only a placeholder for local spark""" return self.latest_spark_proc.returncode - def _init_args(self, master:str, job_name:str): + def _init_args(self, master: str, job_name: str, confs: Dict[str, str]): if master is None: master = "local[*]" logger.info(f"Spark job: {job_name} is running on local spark with master: {master}.") args = [ "spark-submit", - "--master",master, - "--name",job_name, - "--conf", "spark.hadoop.fs.wasbs.impl=org.apache.hadoop.fs.azure.NativeAzureFileSystem", - "--conf", "spark.hadoop.fs.wasbs=org.apache.hadoop.fs.azure.NativeAzureFileSystem", + "--master", + master, + "--name", + job_name, + "--conf", + "spark.hadoop.fs.wasbs.impl=org.apache.hadoop.fs.azure.NativeAzureFileSystem", + "--conf", + "spark.hadoop.fs.wasbs=org.apache.hadoop.fs.azure.NativeAzureFileSystem", ] + + for key, value in confs.items(): + args.extend(["--conf", f"{key}={value}"]) + return args - def _get_debug_file_name(self, debug_folder: str = "debug", prefix:str = None): - """ - auto generated command will be write into cmd file - spark job output will be write into log path with job number as suffix + def _get_debug_file_name(self, debug_folder: str = "debug", prefix: str = None): + """Auto generated command will be write into cmd file. + Spark job output will be write into log path with job number as suffix. """ prefix += datetime.now().strftime("%Y%m%d%H%M%S") debug_path = os.path.join(debug_folder, prefix) print(debug_path) if not os.path.exists(debug_path): - os.makedirs(debug_path) + os.makedirs(debug_path) cmd_file = os.path.join(debug_path, f"command.sh") log_path = os.path.join(debug_path, f"log") @@ -227,7 +239,7 @@ def _get_debug_file_name(self, debug_folder: str = "debug", prefix:str = None): def _get_default_package(self): # default packages of Feathr Core, requires manual update when new dependency introduced or package updated. # TODO: automate this process, e.g. read from pom.xml - # TODO: dynamical modularization: add package only when it's used in the job, e.g. data source dependencies. + # TODO: dynamical modularization: add package only when it's used in the job, e.g. data source dependencies. packages = [] packages.append("org.apache.spark:spark-avro_2.12:3.3.0") packages.append("com.microsoft.sqlserver:mssql-jdbc:10.2.0.jre8") @@ -236,7 +248,7 @@ def _get_default_package(self): packages.append("com.fasterxml.jackson.core:jackson-databind:2.12.6.1") packages.append("org.apache.hadoop:hadoop-mapreduce-client-core:2.7.7") packages.append("org.apache.hadoop:hadoop-common:2.7.7") - packages.append("org.apache.hadoop:hadoop-azure:3.2.0") + packages.append("org.apache.hadoop:hadoop-azure:3.2.0") packages.append("org.apache.avro:avro:1.8.2,org.apache.xbean:xbean-asm6-shaded:4.10") packages.append("org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3") packages.append("com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21") From 283b7c86fe8b79e9fcd13945d69ffda18e813f25 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Thu, 6 Oct 2022 23:13:37 +0000 Subject: [PATCH 2/3] Add dev dependencies. Add unit-test for local spark job launcher Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- feathr_project/feathr/client.py | 53 +++++++++---------- feathr_project/feathr/spark_provider/_abc.py | 4 +- .../spark_provider/_localspark_submission.py | 26 +++++---- feathr_project/pyproject.toml | 13 ++++- feathr_project/setup.py | 14 +++-- .../test_localspark_submission.py | 51 ++++++++++++++++++ 6 files changed, 113 insertions(+), 48 deletions(-) create mode 100644 feathr_project/test/unit/spark_provider/test_localspark_submission.py diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index f21d37d23..0686db200 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -1,39 +1,36 @@ import base64 +import copy import logging import os import tempfile from typing import Dict, List, Union -from feathr.definition.feature import FeatureBase -import copy -import redis from azure.identity import DefaultAzureCredential from jinja2 import Template from pyhocon import ConfigFactory -from feathr.definition.sink import Sink -from feathr.registry.feature_registry import default_registry_client - -from feathr.spark_provider._databricks_submission import _FeathrDatabricksJobLauncher -from feathr.spark_provider._synapse_submission import _FeathrSynapseJobLauncher -from feathr.spark_provider._localspark_submission import _FeathrDLocalSparkJobLauncher +import redis -from feathr.definition._materialization_utils import _to_materialization_config -from feathr.udf._preprocessing_pyudf_manager import _PreprocessingPyudfManager from feathr.constants import * -from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration +from feathr.definition._materialization_utils import _to_materialization_config +from feathr.definition.anchor import FeatureAnchor +from feathr.definition.feature import FeatureBase from feathr.definition.feature_derivations import DerivedFeature from feathr.definition.materialization_settings import MaterializationSettings from feathr.definition.monitoring_settings import MonitoringSettings -from feathr.protobuf.featureValue_pb2 import FeatureValue from feathr.definition.query_feature_list import FeatureQuery from feathr.definition.settings import ObservationSettings -from feathr.definition.feature_derivations import DerivedFeature -from feathr.definition.anchor import FeatureAnchor +from feathr.definition.sink import Sink +from feathr.protobuf.featureValue_pb2 import FeatureValue +from feathr.registry.feature_registry import default_registry_client +from feathr.spark_provider._databricks_submission import _FeathrDatabricksJobLauncher +from feathr.spark_provider._localspark_submission import _FeathrLocalSparkJobLauncher +from feathr.spark_provider._synapse_submission import _FeathrSynapseJobLauncher from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration +from feathr.udf._preprocessing_pyudf_manager import _PreprocessingPyudfManager from feathr.utils._envvariableutil import _EnvVaraibleUtil from feathr.utils._file_utils import write_to_file from feathr.utils.feature_printer import FeaturePrinter -from feathr.utils.spark_job_params import FeatureJoinJobParams, FeatureGenerationJobParams +from feathr.utils.spark_job_params import FeatureGenerationJobParams, FeatureJoinJobParams class FeathrClient(object): @@ -161,7 +158,7 @@ def __init__(self, config_path:str = "./feathr_config.yaml", local_workspace_dir self._FEATHR_JOB_JAR_PATH = \ self.envutils.get_environment_variable_with_default( 'spark_config', 'local', 'feathr_runtime_location') - self.feathr_spark_launcher = _FeathrDLocalSparkJobLauncher( + self.feathr_spark_launcher = _FeathrLocalSparkJobLauncher( workspace_path = self.envutils.get_environment_variable_with_default('spark_config', 'local', 'workspace'), master = self.envutils.get_environment_variable_with_default('spark_config', 'local', 'master') ) @@ -354,7 +351,7 @@ def _decode_proto(self, feature_list): else: typed_result.append(raw_feature) return typed_result - + def delete_feature_from_redis(self, feature_table, key, feature_name) -> None: """ Delete feature from Redis @@ -364,7 +361,7 @@ def delete_feature_from_redis(self, feature_table, key, feature_name) -> None: key: the key of the entity feature_name: feature name to be deleted """ - + redis_key = self._construct_redis_key(feature_table, key) if self.redis_client.hexists(redis_key, feature_name): self.redis_client.delete(redis_key, feature_name) @@ -575,20 +572,20 @@ def monitor_features(self, settings: MonitoringSettings, execution_configuration def _get_feature_key(self, feature_name: str): features = [] if 'derived_feature_list' in dir(self): - features += self.derived_feature_list + features += self.derived_feature_list if 'anchor_list' in dir(self): for anchor in self.anchor_list: - features += anchor.features + features += anchor.features for feature in features: if feature.name == feature_name: keys = feature.key - return set(key.key_column for key in keys) + return set(key.key_column for key in keys) self.logger.warning(f"Invalid feature name: {feature_name}. Please call FeathrClient.build_features() first in order to materialize the features.") return None - + # Validation on feature keys: # Features within a set of aggregation or planned to be merged should have same keys - # The param "allow_empty_key" shows if empty keys are acceptable + # The param "allow_empty_key" shows if empty keys are acceptable def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): keys = None for feature in features: @@ -611,7 +608,7 @@ def _valid_materialize_keys(self, features: List[str], allow_empty_key=False): self.logger.error(f"Inconsistent feature keys. Current keys are {str(keys)}") return False return True - + def materialize_features(self, settings: MaterializationSettings, execution_configurations: Union[SparkExecutionConfiguration ,Dict[str,str]] = {}, verbose: bool = False): """Materialize feature data @@ -622,7 +619,7 @@ def materialize_features(self, settings: MaterializationSettings, execution_conf feature_list = settings.feature_names if len(feature_list) > 0 and not self._valid_materialize_keys(feature_list): raise RuntimeError(f"Invalid materialization features: {feature_list}, since they have different keys. Currently Feathr only supports materializing features of the same keys.") - + # Collect secrets from sinks secrets = [] for sink in settings.sinks: @@ -632,7 +629,7 @@ def materialize_features(self, settings: MaterializationSettings, execution_conf # produce materialization config for end in settings.get_backfill_cutoff_time(): settings.backfill_time.end = end - config = _to_materialization_config(settings) + config = _to_materialization_config(settings) config_file_name = "feature_gen_conf/auto_gen_config_{}.conf".format(end.timestamp()) config_file_path = os.path.join(self.local_workspace_dir, config_file_name) write_to_file(content=config, full_file_name=config_file_path) @@ -854,7 +851,7 @@ def get_features_from_registry(self, project_name: str) -> Dict[str, FeatureBase feature_dict[feature.name] = feature for feature in registry_derived_feature_list: feature_dict[feature.name] = feature - return feature_dict + return feature_dict def _reshape_config_str(self, config_str:str): if self.spark_runtime == 'local': diff --git a/feathr_project/feathr/spark_provider/_abc.py b/feathr_project/feathr/spark_provider/_abc.py index 2644f82fe..c91fdf5c1 100644 --- a/feathr_project/feathr/spark_provider/_abc.py +++ b/feathr_project/feathr/spark_provider/_abc.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod +from typing import Dict, List, Optional, Tuple -from typing import Any, Dict, List, Optional, Tuple class SparkJobLauncher(ABC): """This is the abstract class for all the spark launchers. All the Spark launcher should implement those interfaces @@ -15,7 +15,6 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): """ pass - @abstractmethod def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], reference_files_path: List[str], job_tags: Dict[str, str] = None, @@ -33,6 +32,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: properties (Dict[str, str]): Additional System Properties for the spark job """ pass + @abstractmethod def wait_for_completion(self, timeout_seconds: Optional[float]) -> bool: """Returns true if the job completed successfully diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py index 3bff6dfe4..31ec16f2e 100644 --- a/feathr_project/feathr/spark_provider/_localspark_submission.py +++ b/feathr_project/feathr/spark_provider/_localspark_submission.py @@ -5,7 +5,7 @@ from shlex import split from subprocess import STDOUT, Popen import time -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional from loguru import logger from pyspark import * @@ -14,7 +14,7 @@ from feathr.spark_provider._abc import SparkJobLauncher -class _FeathrDLocalSparkJobLauncher(SparkJobLauncher): +class _FeathrLocalSparkJobLauncher(SparkJobLauncher): """Class to interact with local Spark. This class is not intended to be used in Production environments. It is intended to be used for testing and development purposes. No authentication is required to use this class. @@ -39,7 +39,7 @@ def __init__( self.retry = retry self.retry_sec = retry_sec self.packages = self._get_default_package() - self.master = master + self.master = master or "local[*]" def upload_or_get_cloud_path(self, local_path_or_http_path: str): """For Local Spark Case, no need to upload to cloud workspace.""" @@ -48,14 +48,14 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): def submit_feathr_job( self, job_name: str, - main_jar_path: str = None, - main_class_name: str = None, + main_jar_path: str, + main_class_name: str, arguments: List[str] = None, python_files: List[str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}, *_, - ): + ) -> Any: """Submits the Feathr job to local spark, using subprocess args. Note that the Spark application will automatically run on YARN cluster mode. You cannot change it if you are running with Azure Synapse. @@ -79,7 +79,7 @@ def submit_feathr_job( # Get conf and package arguments cfg = configuration.copy() if configuration else {} maven_dependency = f"{cfg.pop('spark.jars.packages', self.packages)},{FEATHR_MAVEN_ARTIFACT}" - spark_args = self._init_args(master=self.master, job_name=job_name, confs=cfg) + spark_args = self._init_args(job_name=job_name, confs=cfg) if not main_jar_path: # We don't have the main jar, use Maven @@ -199,14 +199,12 @@ def get_status(self) -> str: """Get the status of the job, only a placeholder for local spark""" return self.latest_spark_proc.returncode - def _init_args(self, master: str, job_name: str, confs: Dict[str, str]): - if master is None: - master = "local[*]" - logger.info(f"Spark job: {job_name} is running on local spark with master: {master}.") + def _init_args(self, job_name: str, confs: Dict[str, str]) -> List[str]: + logger.info(f"Spark job: {job_name} is running on local spark with master: {self.master}.") args = [ "spark-submit", "--master", - master, + self.master, "--name", job_name, "--conf", @@ -215,8 +213,8 @@ def _init_args(self, master: str, job_name: str, confs: Dict[str, str]): "spark.hadoop.fs.wasbs=org.apache.hadoop.fs.azure.NativeAzureFileSystem", ] - for key, value in confs.items(): - args.extend(["--conf", f"{key}={value}"]) + for k, v in confs.items(): + args.extend(["--conf", f"{k}={v}"]) return args diff --git a/feathr_project/pyproject.toml b/feathr_project/pyproject.toml index f8d897579..693233dc2 100644 --- a/feathr_project/pyproject.toml +++ b/feathr_project/pyproject.toml @@ -1,6 +1,17 @@ +[tool.black] +line-length = 120 +target_version = ['py38'] + +[tool.isort] +profile = "black" +line_length = 120 +known_first_party = ['feathr'] +force_sort_within_sections = true +multi_line_output = 3 + [build-system] requires = [ "setuptools", "wheel" ] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" diff --git a/feathr_project/setup.py b/feathr_project/setup.py index e937f19c4..ce7ec14d6 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -20,7 +20,7 @@ include_package_data=True, # consider install_requires=[ - 'click<=8.1.3', + "click<=8.1.3", "py4j<=0.10.9.7", "loguru<=0.6.0", "pandas<=1.5.0", @@ -54,9 +54,17 @@ "azure-core<=1.22.1", "typing_extensions>=4.2.0" ], - tests_require=[ - 'pytest', + tests_require=[ # TODO: This has been depricated + "pytest", ], + extras_require=dict( + dev=[ + "black>=22.1.0", # formatter + "isort", # sort import statements + "pytest>=7", + "pytest-mock>=3.8.1", + ], + ), entry_points={ 'console_scripts': ['feathr=feathrcli.cli:cli'] }, diff --git a/feathr_project/test/unit/spark_provider/test_localspark_submission.py b/feathr_project/test/unit/spark_provider/test_localspark_submission.py new file mode 100644 index 000000000..9a9d7238b --- /dev/null +++ b/feathr_project/test/unit/spark_provider/test_localspark_submission.py @@ -0,0 +1,51 @@ +from typing import Dict +from unittest.mock import MagicMock + +import pytest +from pytest_mock import MockerFixture + +from feathr.spark_provider._localspark_submission import _FeathrLocalSparkJobLauncher + + +@pytest.fixture(scope="function") +def local_spark_job_launcher(tmp_path) -> _FeathrLocalSparkJobLauncher: + return _FeathrLocalSparkJobLauncher( + workspace_path=str(tmp_path), + debug_folder=str(tmp_path), + ) + + +def test__local_spark_job_launcher__submit_feathr_job( + mocker: MockerFixture, + local_spark_job_launcher: _FeathrLocalSparkJobLauncher, +): + # Mock necessary components + local_spark_job_launcher._init_args = MagicMock(return_value=[]) + mocked_proc = MagicMock() + mocked_proc.args = [] + mocked_proc.pid = 0 + + mocked_spark_proc = mocker.patch("feathr.spark_provider._localspark_submission.Popen", return_value=mocked_proc) + + local_spark_job_launcher.submit_feathr_job( + job_name="unit-test", + main_jar_path="", + main_class_name="", + ) + + # Assert if the mocked spark process has called once + mocked_spark_proc.assert_called_once() + + +@pytest.mark.parametrize( + "confs", [{}, {"spark.feathr.outputFormat": "parquet"}] +) +def test__local_spark_job_launcher__init_args( + local_spark_job_launcher: _FeathrLocalSparkJobLauncher, + confs: Dict[str, str], +): + spark_args = local_spark_job_launcher._init_args(job_name=None, confs=confs) + + # Assert if spark_args contains confs at the end + for k, v in confs.items(): + assert spark_args[-1] == f"{k}={v}" From d6c24bfea1d558be3c5fd4db7444bf92ced92fc4 Mon Sep 17 00:00:00 2001 From: Jun Ki Min <42475935+loomlike@users.noreply.github.com> Date: Mon, 10 Oct 2022 20:00:46 +0000 Subject: [PATCH 3/3] Fix local spark submission unused param error Signed-off-by: Jun Ki Min <42475935+loomlike@users.noreply.github.com> --- .../spark_provider/_localspark_submission.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/feathr_project/feathr/spark_provider/_localspark_submission.py b/feathr_project/feathr/spark_provider/_localspark_submission.py index 31ec16f2e..afed9683d 100644 --- a/feathr_project/feathr/spark_provider/_localspark_submission.py +++ b/feathr_project/feathr/spark_provider/_localspark_submission.py @@ -54,22 +54,21 @@ def submit_feathr_job( python_files: List[str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}, - *_, + **_, ) -> Any: """Submits the Feathr job to local spark, using subprocess args. Note that the Spark application will automatically run on YARN cluster mode. You cannot change it if you are running with Azure Synapse. Args: - job_name (str): name of the job - main_jar_path (str): main file paths, usually your main jar file - main_class_name (str): name of your main class - arguments (List[str]): all the arguments you want to pass into the spark job - python_files (List[str]): required .zip, .egg, or .py files of spark job - configuration (Dict[str, str]): Additional configs for the spark job - - reference_files_path (str): not used in local spark mode - job_tags (str): not used in local spark mode + job_name: name of the job + main_jar_path: main file paths, usually your main jar file + main_class_name: name of your main class + arguments: all the arguments you want to pass into the spark job + python_files: required .zip, .egg, or .py files of spark job + configuration: Additional configs for the spark job + properties: System properties configuration + **_: Not used arguments in local spark mode, such as reference_files_path and job_tags """ logger.warning( f"Local Spark Mode only support basic params right now and should be used only for testing purpose."