From 54433f2c642fd122f97f0d9f23b2fb94dac93eee Mon Sep 17 00:00:00 2001 From: Daniel Zhu Date: Wed, 27 Mar 2024 12:48:00 -0700 Subject: [PATCH 1/2] feat: update implementation of ClassificationAccuracySemanticRobustness to use Transform-based approach --- ...sification_accuracy_semantic_robustness.py | 397 +++++++----------- .../qa_accuracy_semantic_robustness.py | 8 +- ...sification_accuracy_semantic_robustness.py | 225 +--------- ...sification_accuracy_semantic_robustness.py | 333 ++++----------- .../test_qa_accuracy_semantic_robustness.py | 2 +- 5 files changed, 226 insertions(+), 739 deletions(-) diff --git a/src/fmeval/eval_algorithms/classification_accuracy_semantic_robustness.py b/src/fmeval/eval_algorithms/classification_accuracy_semantic_robustness.py index 83febfcb..0628c15b 100644 --- a/src/fmeval/eval_algorithms/classification_accuracy_semantic_robustness.py +++ b/src/fmeval/eval_algorithms/classification_accuracy_semantic_robustness.py @@ -1,68 +1,45 @@ import logging import warnings -from collections import defaultdict - -from typing import Any, Callable, List, Optional, Dict +from typing import Callable, List, Optional from dataclasses import dataclass -import fmeval.util as util from fmeval.constants import ( DatasetColumns, MEAN, - BUTTER_FINGER, - RANDOM_UPPER_CASE, - WHITESPACE_ADD_REMOVE, - PREFIX_FOR_DELTA_SCORES, ) from fmeval.data_loaders.util import get_dataset from fmeval.data_loaders.data_config import DataConfig -from fmeval.eval_algorithms.semantic_perturbation_utils import ( - ButterFingerConfig, - RandomUpperCaseConfig, - WhitespaceAddRemoveConfig, +from fmeval.eval_algorithms.semantic_robustness_utils import ( + SemanticRobustnessConfig, + get_perturbation_transform, + get_model_outputs_from_perturbed_inputs, ) from fmeval.eval_algorithms.util import ( - generate_prompt_column_for_dataset, - aggregate_evaluation_scores, + get_dataset_configs, validate_dataset, - save_dataset, - generate_output_dataset_path, - generate_mean_delta_score, - generate_model_predict_response_for_dataset, -) -from fmeval.eval_algorithms.eval_algorithm import ( - EvalAlgorithmInterface, - EvalAlgorithmConfig, + create_model_invocation_pipeline, + evaluate_dataset, ) +from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface from fmeval.eval_algorithms import ( EvalAlgorithm, EvalOutput, EvalScore, - EVAL_DATASETS, - DATASET_CONFIGS, get_default_prompt_template, DEFAULT_PROMPT_TEMPLATE, ) -from fmeval.exceptions import EvalAlgorithmClientError -from fmeval.model_runners.composers.composers import PromptComposer from fmeval.model_runners.model_runner import ModelRunner -from fmeval.perf_util import timed_block from fmeval.eval_algorithms.classification_accuracy import ( convert_model_output_to_label, - ClassificationAccuracy, - ClassificationAccuracyConfig, CLASSIFICATION_ACCURACY_SCORE, UNIQUENESS_FACTOR, + ClassificationAccuracyScores, + CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME, ) -from fmeval.eval_algorithms.semantic_perturbation_utils import ButterFinger, RandomUpperCase, WhitespaceAddRemove - -# All the perturbation types supported by this eval algo -PERTURBATION_TYPE_TO_HELPER_CLASS = { - BUTTER_FINGER: ButterFinger, - RANDOM_UPPER_CASE: RandomUpperCase, - WHITESPACE_ADD_REMOVE: WhitespaceAddRemove, -} +from fmeval.transforms.semantic_robustness_metrics import MeanDeltaScores +from fmeval.transforms.transform_pipeline import TransformPipeline +from fmeval.util import get_eval_results_path PREFIX_FOR_DELTA_SCORES = "delta_" DELTA_CLASSIFICATION_ACCURACY_SCORE = PREFIX_FOR_DELTA_SCORES + CLASSIFICATION_ACCURACY_SCORE @@ -70,40 +47,21 @@ logger = logging.getLogger(__name__) -@dataclass -class ClassificationAccuracySemanticRobustnessConfig(EvalAlgorithmConfig): - """ - Configuration for the Classification Accuracy Semantic Robustness Evaluation - - :param valid_labels: List of valid string label - :param converter_fn: Function to process model output to labels, defaults to simple integer conversion - :param perturbation_type: perturbation type for generating perturbed inputs - :param num_perturbations: Number of perturbed inputs to be generated for robustness evaluation - :param butter_finger_perturbation_prob: The probability that a given character will be perturbed. Used for - butter_finger perturbation_type - :param random_uppercase_corrupt_proportion: Fraction of characters to be changed to uppercase. Used for - random_upper_case perturbation_type - :param whitespace_remove_prob: Given a whitespace, remove it with this much probability. Used for - whitespace_add_remove perturbation_type - :param whitespace_add_prob: Given a non-whitespace, add a whitespace before it with this probability. Used for - whitespace_add_remove perturbation_type +@dataclass(frozen=True) +class ClassificationAccuracySemanticRobustnessConfig(SemanticRobustnessConfig): + """Configures the Classification Accuracy Semantic Robustness evaluation algorithm. + + See SemanticRobustnessConfig for the configurable parameters that this config class inherits. + + :param valid_labels: A list of valid labels. + :param converter_fn: Function to process model output to labels. Defaults to simple integer conversion. """ valid_labels: Optional[List[str]] = None converter_fn: Callable[[str, List[str]], str] = convert_model_output_to_label - perturbation_type: str = BUTTER_FINGER - num_perturbations: int = 5 - butter_finger_perturbation_prob: float = 0.1 - random_uppercase_corrupt_proportion: float = 0.1 - whitespace_remove_prob: float = 0.1 - whitespace_add_prob: float = 0.05 def __post_init__(self): - if self.perturbation_type not in PERTURBATION_TYPE_TO_HELPER_CLASS.keys(): - raise EvalAlgorithmClientError( - f"Invalid perturbation type '{self.perturbation_type} requested, please " - f"choose from acceptable values: {PERTURBATION_TYPE_TO_HELPER_CLASS.keys()}" - ) + super().__post_init__() if self.valid_labels: for i, label in enumerate(self.valid_labels): if not isinstance(label, str): @@ -111,9 +69,6 @@ def __post_init__(self): self.valid_labels[i] = str(label) -CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS = EvalAlgorithm.CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS.value - - class ClassificationAccuracySemanticRobustness(EvalAlgorithmInterface): """Semantic Robustness evaluation algorithm for Classification Accuracy @@ -134,55 +89,116 @@ def __init__( self, eval_algorithm_config: ClassificationAccuracySemanticRobustnessConfig = ClassificationAccuracySemanticRobustnessConfig(), ): - """Default constructor + """ClassificationAccuracySemanticRobustness initializer. - :param eval_algorithm_config: Classification Accuracy Semantic Robustness eval algorithm config. + :param eval_algorithm_config: Classification Accuracy Semantic Robustness evaluation algorithm config. """ super().__init__(eval_algorithm_config) - self.eval_name = CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS - self._eval_algorithm_config = eval_algorithm_config - self._classification_accuracy_eval_algo = ClassificationAccuracy( - eval_algorithm_config=ClassificationAccuracyConfig( - valid_labels=self._eval_algorithm_config.valid_labels, - converter_fn=self._eval_algorithm_config.converter_fn, - ) + self.config = eval_algorithm_config + self.perturbation_transform = get_perturbation_transform(eval_algorithm_config) + self.valid_labels = eval_algorithm_config.valid_labels + self.converter_fn = eval_algorithm_config.converter_fn + + def _build_pipeline( + self, + model: ModelRunner, + prompt_template: str, + valid_labels: Optional[List[str]], + ) -> TransformPipeline: + """Build the TransformPipeline to be used by `evaluate` and `evaluate_sample`. + + While other evaluation algorithms (ex: Classification Accuracy) can configure + their TransformPipeline at algorithm initialization, because the Classification Accuracy + Semantic Robustness algorithm's evaluation logic depends on the ModelRunner + and prompt template that are evaluation-specific (i.e. these parameters aren't + configured at the algorithm level), the pipeline used by this algorithm is built + when `evaluate` or `evaluate_sample` is called. + + :param model: The ModelRunner representing the model under evaluation. + :param prompt_template: A template that is used to construct the prompt fed to the model. + :param valid_labels: A list of valid labels for the classified model output. + :returns: A TransformPipeline that can be used by either `evaluate_sample` or `evaluate`. + """ + get_perturbed_inputs, gen_perturbed_prompts, get_perturbed_outputs = get_model_outputs_from_perturbed_inputs( + self.perturbation_transform, + prompt_template, + model, ) - if self._eval_algorithm_config.perturbation_type == BUTTER_FINGER: - self._perturbation_config = ButterFingerConfig(self._eval_algorithm_config.butter_finger_perturbation_prob) - elif self._eval_algorithm_config.perturbation_type == RANDOM_UPPER_CASE: - self._perturbation_config = RandomUpperCaseConfig( - self._eval_algorithm_config.random_uppercase_corrupt_proportion - ) - else: - self._perturbation_config = WhitespaceAddRemoveConfig( - self._eval_algorithm_config.whitespace_remove_prob, self._eval_algorithm_config.whitespace_add_prob + original_scores = ClassificationAccuracyScores(valid_labels=valid_labels, converter_fn=self.converter_fn) + perturbed_scores = [ + ClassificationAccuracyScores( + valid_labels=valid_labels, + model_output_key=perturbed_output_key, + classified_model_output_key=f"{CLASSIFIED_MODEL_OUTPUT_COLUMN_NAME}_perturbed_{i}", + classification_accuracy_score_key=f"{CLASSIFICATION_ACCURACY_SCORE}_perturbed_{i}", + converter_fn=self.converter_fn, ) + for i, perturbed_output_key in enumerate(get_perturbed_outputs.output_keys) + ] - self._classification_accuracy_eval_algo = ClassificationAccuracy( - eval_algorithm_config=ClassificationAccuracyConfig( - valid_labels=eval_algorithm_config.valid_labels, converter_fn=eval_algorithm_config.converter_fn - ) + perturbed_score_keys = [ + perturbed_score_transform.classification_accuracy_score_key + for perturbed_score_transform in perturbed_scores + ] + mean_delta_scores = MeanDeltaScores( + {CLASSIFICATION_ACCURACY_SCORE: (perturbed_score_keys, DELTA_CLASSIFICATION_ACCURACY_SCORE)} ) - def __reduce__(self): # pragma: no cover - """ - Custom serializer method used by Ray when it serializes instances of this - class during dataset.map() operations. + transforms = [ + get_perturbed_inputs, + gen_perturbed_prompts, + get_perturbed_outputs, + original_scores, + TransformPipeline(perturbed_scores), + mean_delta_scores, + ] + pipeline = TransformPipeline(transforms) + return pipeline + + def evaluate_sample( + self, + model_input: str, + target_output: str, + model: ModelRunner, + prompt_template: str = DEFAULT_PROMPT_TEMPLATE, + ) -> List[EvalScore]: + """Compute classification accuracy semantic robustness metrics for a single sample. + + A sample is defined as a model input and target output pair. + + :param model_input: Text input, which will be composed into a prompt that gets fed to the model. + :param target_output: The expected response from the model. + :param model: An instance of ModelRunner representing the model under evaluation. + :param prompt_template: A template used to compose the prompt from `model_input`. + :return: A list of EvalScores. """ - serialized_data = (self._eval_algorithm_config,) - return ClassificationAccuracySemanticRobustness, serialized_data + sample = { + DatasetColumns.MODEL_INPUT.value.name: model_input, + DatasetColumns.TARGET_OUTPUT.value.name: target_output, + } + invoke_model = create_model_invocation_pipeline(model, prompt_template) + compute_metrics = self._build_pipeline(model, prompt_template, self.valid_labels) + pipeline = TransformPipeline([invoke_model, compute_metrics]) + output_record = pipeline.execute_record(sample) + + original_score = EvalScore( + name=CLASSIFICATION_ACCURACY_SCORE, value=output_record[CLASSIFICATION_ACCURACY_SCORE] + ) + delta_score = EvalScore( + name=DELTA_CLASSIFICATION_ACCURACY_SCORE, value=output_record[DELTA_CLASSIFICATION_ACCURACY_SCORE] + ) + return [original_score, delta_score] def evaluate( self, model: ModelRunner, dataset_config: Optional[DataConfig] = None, prompt_template: Optional[str] = None, + num_records: int = 100, save: bool = False, - num_records=100, ) -> List[EvalOutput]: - """ - Classification Accuracy Semantic Robustness evaluate. + """Compute classification accuracy semantic robustness metrics on one or more datasets. :param model: An instance of ModelRunner representing the model under evaluation. This is a required argument, as even if the dataset contains model outputs, @@ -199,171 +215,40 @@ def evaluate( evaluation :returns: A List of EvalOutput objects. """ - util.require( - model, - "Missing required input: model i.e. ModelRunner, for ClassificationAccuracySemanticRobustness evaluate", - ) - if dataset_config: - dataset_configs = [dataset_config] - else: - dataset_configs = [DATASET_CONFIGS[dataset_name] for dataset_name in EVAL_DATASETS[self.eval_name]] - + dataset_configs = get_dataset_configs(dataset_config, self.eval_name) eval_outputs: List[EvalOutput] = [] + for dataset_config in dataset_configs: - dataset = get_dataset(dataset_config, num_records) - validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name, DatasetColumns.MODEL_INPUT.value.name]) dataset_prompt_template = ( get_default_prompt_template(dataset_config.dataset_name) if not prompt_template else prompt_template ) - dataset = generate_prompt_column_for_dataset( - prompt_template=dataset_prompt_template, - data=dataset, - model_input_column_name=DatasetColumns.MODEL_INPUT.value.name, - prompt_column_name=DatasetColumns.PROMPT.value.name, - ) + dataset = get_dataset(dataset_config, num_records) + validate_dataset(dataset, [DatasetColumns.TARGET_OUTPUT.value.name, DatasetColumns.MODEL_INPUT.value.name]) - dataset = generate_model_predict_response_for_dataset( - model=model, - data=dataset, - model_input_column_name=DatasetColumns.PROMPT.value.name, - model_output_column_name=DatasetColumns.MODEL_OUTPUT.value.name, + valid_labels = ( + self.valid_labels + if self.valid_labels + else dataset.unique(column=DatasetColumns.TARGET_OUTPUT.value.name) ) - - config_valid_labels = self._eval_algorithm_config.valid_labels - if not self._eval_algorithm_config.valid_labels: # pragma: no branch - self._eval_algorithm_config.valid_labels = dataset.unique( - column=DatasetColumns.TARGET_OUTPUT.value.name - ) - row_count = dataset.count() - assert self._eval_algorithm_config.valid_labels is not None # to satisfy mypy - if ( - len(self._eval_algorithm_config.valid_labels) / (row_count + 1) < UNIQUENESS_FACTOR - ): # pragma: no cover - logger.warning( - f"The number of classes: {len(self._eval_algorithm_config.valid_labels)} in the dataset is too large " - f"for the number of rows in the dataset: {row_count}", - ) - self._classification_accuracy_eval_algo = ClassificationAccuracy( - eval_algorithm_config=ClassificationAccuracyConfig( - valid_labels=self._eval_algorithm_config.valid_labels, - converter_fn=self._eval_algorithm_config.converter_fn, - ) - ) - with timed_block(f"Computing score and aggregation on dataset {dataset_config.dataset_name}", logger): - - def _generate_score_columns(row: Dict[str, Any]) -> Dict[str, Any]: # pragma: no cover - scores = self.evaluate_sample( - model_input=row[DatasetColumns.MODEL_INPUT.value.name], - model=model, - target_output=row[DatasetColumns.TARGET_OUTPUT.value.name], - model_output=row[DatasetColumns.MODEL_OUTPUT.value.name], - prompt_template=dataset_prompt_template, - ) - for score in scores: - row[score.name] = score.value - return row - - dataset = dataset.map(_generate_score_columns).materialize() - - dataset_scores, category_scores = aggregate_evaluation_scores( - dataset, [CLASSIFICATION_ACCURACY_SCORE, DELTA_CLASSIFICATION_ACCURACY_SCORE], agg_method=MEAN - ) - - eval_outputs.append( - EvalOutput( - eval_name=self.eval_name, - dataset_name=dataset_config.dataset_name, - prompt_template=dataset_prompt_template, - dataset_scores=dataset_scores, - category_scores=category_scores, - output_path=generate_output_dataset_path( - path_to_parent_dir=util.get_eval_results_path(), - eval_name=self.eval_name, - dataset_name=dataset_config.dataset_name, - ), - ) - ) - # set it back to the same value as before the start of evaluating this dataset - self._eval_algorithm_config.valid_labels = config_valid_labels - if save: - save_dataset( - dataset=dataset, - score_names=[CLASSIFICATION_ACCURACY_SCORE, DELTA_CLASSIFICATION_ACCURACY_SCORE], - path=generate_output_dataset_path( - path_to_parent_dir=util.get_eval_results_path(), - eval_name=self.eval_name, - dataset_name=dataset_config.dataset_name, - ), + row_count = dataset.count() + if len(valid_labels) / (row_count + 1) < UNIQUENESS_FACTOR: # pragma: no cover + logger.warning( + f"The number of classes: {len(valid_labels)} in the dataset is too large " + f"for the number of rows in the dataset: {row_count}", ) - return eval_outputs - - def evaluate_sample( - self, - model_input: str, - model: ModelRunner, - target_output: str, - model_output: Optional[str] = None, - prompt_template: str = DEFAULT_PROMPT_TEMPLATE, - ) -> List[EvalScore]: # type: ignore[override] - """ - Evaluate a single record for Classification Accuracy Semantic Robustness. - - :param model_input: text input for model - :param model: An instance of ModelRunner which is the model under evaluation - :param target_output: The expected responses from the model - :param model_output: The output of a model that we want to evaluate - :param prompt_template: A template which can be used to compose prompt using model_input - :returns: A List of EvalScores computed for prompts and responses. - """ - util.require( - model_input, - "Missing required input: model_input, for ClassificationAccuracySemanticRobustness evaluate_sample", - ) - util.require( - model, - "Missing required input: model i.e. ModelRunner, for ClassificationAccuracySemanticRobustness evaluate_sample", - ) - util.require( - target_output, - "Missing required input: target_output, for " "ClassificationAccuracySemanticRobustness evaluate_sample", - ) - - prompt_composer = PromptComposer(prompt_template) - original_prompt = prompt_composer.compose(model_input) - original_model_output = model_output if model_output else model.predict(original_prompt)[0] - - perturbation = PERTURBATION_TYPE_TO_HELPER_CLASS[self._eval_algorithm_config.perturbation_type]() - - perturbed_inputs = perturbation.perturb( - text=model_input, - config=self._perturbation_config, - num_perturbations=self._eval_algorithm_config.num_perturbations, - ) - - perturbed_input_prompts = [prompt_composer.compose(perturbed_input) for perturbed_input in perturbed_inputs] - perturbed_input_outputs = [model.predict(prompt)[0] for prompt in perturbed_input_prompts] - - original_classification_accuracy_scores = self._classification_accuracy_eval_algo.evaluate_sample( - target_output=target_output, model_output=original_model_output - ) - - perturbed_outputs_classification_accuracy_scores = defaultdict(lambda: []) - for perturbed_input_output in perturbed_input_outputs: - accuracy_scores = self._classification_accuracy_eval_algo.evaluate_sample( - target_output=target_output, model_output=perturbed_input_output - ) - for accuracy_score in accuracy_scores: - perturbed_outputs_classification_accuracy_scores[accuracy_score.name].append(accuracy_score) - - delta_scores = [ - EvalScore( - name=PREFIX_FOR_DELTA_SCORES + original_score.name, - value=generate_mean_delta_score( - original_score, perturbed_outputs_classification_accuracy_scores[original_score.name] - ), + eval_output = evaluate_dataset( + dataset=dataset, + pipeline=self._build_pipeline(model, dataset_prompt_template, valid_labels), + dataset_name=dataset_config.dataset_name, + eval_name=self.eval_name, + metric_names=[CLASSIFICATION_ACCURACY_SCORE, DELTA_CLASSIFICATION_ACCURACY_SCORE], + eval_results_path=get_eval_results_path(), + model=model, + prompt_template=dataset_prompt_template, + agg_method=MEAN, + save=save, ) - for original_score in original_classification_accuracy_scores - ] + eval_outputs.append(eval_output) - return original_classification_accuracy_scores + delta_scores + return eval_outputs diff --git a/src/fmeval/eval_algorithms/qa_accuracy_semantic_robustness.py b/src/fmeval/eval_algorithms/qa_accuracy_semantic_robustness.py index d0108e52..48d89a0d 100644 --- a/src/fmeval/eval_algorithms/qa_accuracy_semantic_robustness.py +++ b/src/fmeval/eval_algorithms/qa_accuracy_semantic_robustness.py @@ -122,7 +122,7 @@ def __init__( self.perturbation_transform = get_perturbation_transform(eval_algorithm_config) self.target_output_delimiter = eval_algorithm_config.target_output_delimiter - def build_pipeline( + def _build_pipeline( self, model: ModelRunner, prompt_template: str, @@ -187,7 +187,7 @@ def evaluate_sample( ) -> List[EvalScore]: """Compute question answering accuracy semantic robustness metrics for a single sample. - A sample is defined as a model input and model output pair. + A sample is defined as a model input and target output pair. :param model_input: Text input, which will be composed into a prompt that gets fed to the model. :param target_output: The expected response from the model. @@ -200,7 +200,7 @@ def evaluate_sample( DatasetColumns.TARGET_OUTPUT.value.name: target_output, } invoke_model = create_model_invocation_pipeline(model, prompt_template) - compute_metrics = self.build_pipeline(model, prompt_template) + compute_metrics = self._build_pipeline(model, prompt_template) pipeline = TransformPipeline([invoke_model, compute_metrics]) output_record = pipeline.execute_record(sample) @@ -247,7 +247,7 @@ def evaluate( validate_dataset(dataset, [DatasetColumns.MODEL_INPUT.value.name, DatasetColumns.TARGET_OUTPUT.value.name]) eval_output = evaluate_dataset( dataset=dataset, - pipeline=self.build_pipeline(model, dataset_prompt_template), + pipeline=self._build_pipeline(model, dataset_prompt_template), dataset_name=dataset_config.dataset_name, eval_name=self.eval_name, metric_names=ORIGINAL_SCORES + DELTA_SCORES, diff --git a/test/integration/test_classification_accuracy_semantic_robustness.py b/test/integration/test_classification_accuracy_semantic_robustness.py index 49010e1d..ae1e2ecf 100644 --- a/test/integration/test_classification_accuracy_semantic_robustness.py +++ b/test/integration/test_classification_accuracy_semantic_robustness.py @@ -3,18 +3,16 @@ from pytest import approx from typing import NamedTuple, Dict +from fmeval.constants import BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE from fmeval.eval_algorithms import ( DATASET_CONFIGS, WOMENS_CLOTHING_ECOMMERCE_REVIEWS, ) from fmeval.eval_algorithms.classification_accuracy_semantic_robustness import ( - BUTTER_FINGER, CLASSIFICATION_ACCURACY_SCORE, ClassificationAccuracySemanticRobustness, ClassificationAccuracySemanticRobustnessConfig, DELTA_CLASSIFICATION_ACCURACY_SCORE, - RANDOM_UPPER_CASE, - WHITESPACE_ADD_REMOVE, ) from test.integration.models.model_runners import ( @@ -39,10 +37,10 @@ class TestClassificationAccuracySemanticRobustness: [ CASRTestCase( config=ClassificationAccuracySemanticRobustnessConfig( - valid_labels=SAMPLE_VALID_LABELS, perturbation_type=BUTTER_FINGER, num_perturbations=5, butter_finger_perturbation_prob=0.1, + valid_labels=SAMPLE_VALID_LABELS, ), aggregate_scores={ CLASSIFICATION_ACCURACY_SCORE: 1, @@ -52,10 +50,10 @@ class TestClassificationAccuracySemanticRobustness: ), CASRTestCase( config=ClassificationAccuracySemanticRobustnessConfig( - valid_labels=SAMPLE_VALID_LABELS, perturbation_type=RANDOM_UPPER_CASE, num_perturbations=5, random_uppercase_corrupt_proportion=0.1, + valid_labels=SAMPLE_VALID_LABELS, ), aggregate_scores={ CLASSIFICATION_ACCURACY_SCORE: 1, @@ -65,11 +63,11 @@ class TestClassificationAccuracySemanticRobustness: ), CASRTestCase( config=ClassificationAccuracySemanticRobustnessConfig( - valid_labels=SAMPLE_VALID_LABELS, perturbation_type=WHITESPACE_ADD_REMOVE, num_perturbations=5, whitespace_remove_prob=0.1, whitespace_add_prob=0.05, + valid_labels=SAMPLE_VALID_LABELS, ), aggregate_scores={ CLASSIFICATION_ACCURACY_SCORE: 1, @@ -86,8 +84,8 @@ def test_evaluate_sample(self, casr_test_case): "or 1 (positive sentiment). Review: $model_input. Classification:" eval_scores = ca_semantic_robustness.evaluate_sample( model_input=model_input, - model=hf_model_runner, target_output="1", + model=hf_model_runner, prompt_template=prompt_template, ) for eval_score in eval_scores: @@ -98,80 +96,10 @@ def test_evaluate_sample(self, casr_test_case): [ CASRTestCase( config=ClassificationAccuracySemanticRobustnessConfig( - valid_labels=SAMPLE_VALID_LABELS, perturbation_type=BUTTER_FINGER, num_perturbations=5, butter_finger_perturbation_prob=0.1, - ), - aggregate_scores={ - CLASSIFICATION_ACCURACY_SCORE: 0.83, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0, - }, - category_scores={ - "Blouses": { - CLASSIFICATION_ACCURACY_SCORE: 0.8, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Dresses": { - CLASSIFICATION_ACCURACY_SCORE: 0.8571428571428571, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Fine gauge": { - CLASSIFICATION_ACCURACY_SCORE: 0.75, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Jackets": { - CLASSIFICATION_ACCURACY_SCORE: 0.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Jeans": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Knits": { - CLASSIFICATION_ACCURACY_SCORE: 0.9166666666666666, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Lounge": { - CLASSIFICATION_ACCURACY_SCORE: 0.75, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Outerwear": { - CLASSIFICATION_ACCURACY_SCORE: 0.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Pants": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Shorts": { - CLASSIFICATION_ACCURACY_SCORE: 0.6666666666666666, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Skirts": { - CLASSIFICATION_ACCURACY_SCORE: 0.8, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Sweaters": { - CLASSIFICATION_ACCURACY_SCORE: 0.7142857142857143, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Swim": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Trend": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - }, - ), - CASRTestCase( - config=ClassificationAccuracySemanticRobustnessConfig( valid_labels=SAMPLE_VALID_LABELS, - perturbation_type=RANDOM_UPPER_CASE, - num_perturbations=5, - random_uppercase_corrupt_proportion=0.1, ), aggregate_scores={ CLASSIFICATION_ACCURACY_SCORE: 0.83, @@ -236,151 +164,10 @@ def test_evaluate_sample(self, casr_test_case): }, }, ), - CASRTestCase( - config=ClassificationAccuracySemanticRobustnessConfig( - valid_labels=SAMPLE_VALID_LABELS, - perturbation_type=WHITESPACE_ADD_REMOVE, - num_perturbations=5, - whitespace_remove_prob=0.1, - whitespace_add_prob=0.05, - ), - aggregate_scores={ - CLASSIFICATION_ACCURACY_SCORE: 0.83, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0, - }, - category_scores={ - "Blouses": { - CLASSIFICATION_ACCURACY_SCORE: 0.8, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Dresses": { - CLASSIFICATION_ACCURACY_SCORE: 0.8571428571428571, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Fine gauge": { - CLASSIFICATION_ACCURACY_SCORE: 0.75, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Jackets": { - CLASSIFICATION_ACCURACY_SCORE: 0.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Jeans": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Knits": { - CLASSIFICATION_ACCURACY_SCORE: 0.9166666666666666, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Lounge": { - CLASSIFICATION_ACCURACY_SCORE: 0.75, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Outerwear": { - CLASSIFICATION_ACCURACY_SCORE: 0.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Pants": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Shorts": { - CLASSIFICATION_ACCURACY_SCORE: 0.6666666666666666, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Skirts": { - CLASSIFICATION_ACCURACY_SCORE: 0.8, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Sweaters": { - CLASSIFICATION_ACCURACY_SCORE: 0.7142857142857143, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Swim": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Trend": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - }, - ), - CASRTestCase( - config=None, - aggregate_scores={ - CLASSIFICATION_ACCURACY_SCORE: 0.83, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0, - }, - category_scores={ - "Blouses": { - CLASSIFICATION_ACCURACY_SCORE: 0.8, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Dresses": { - CLASSIFICATION_ACCURACY_SCORE: 0.8571428571428571, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Fine gauge": { - CLASSIFICATION_ACCURACY_SCORE: 0.75, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Jackets": { - CLASSIFICATION_ACCURACY_SCORE: 0.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Jeans": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Knits": { - CLASSIFICATION_ACCURACY_SCORE: 0.9166666666666666, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Lounge": { - CLASSIFICATION_ACCURACY_SCORE: 0.75, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Outerwear": { - CLASSIFICATION_ACCURACY_SCORE: 0.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Pants": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Shorts": { - CLASSIFICATION_ACCURACY_SCORE: 0.6666666666666666, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Skirts": { - CLASSIFICATION_ACCURACY_SCORE: 0.8, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Sweaters": { - CLASSIFICATION_ACCURACY_SCORE: 0.7142857142857143, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Swim": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - "Trend": { - CLASSIFICATION_ACCURACY_SCORE: 1.0, - DELTA_CLASSIFICATION_ACCURACY_SCORE: 0.0, - }, - }, - ), ], ) def test_evaluate(self, casr_test_case): - if casr_test_case.config: - ca_semantic_robustness = ClassificationAccuracySemanticRobustness( - eval_algorithm_config=casr_test_case.config - ) - else: - ca_semantic_robustness = ClassificationAccuracySemanticRobustness() + ca_semantic_robustness = ClassificationAccuracySemanticRobustness(eval_algorithm_config=casr_test_case.config) prompt_template = "Classify the sentiment of the following review with 0 (negative sentiment) " "or 1 (positive sentiment). Review: $model_input. Classification:" dataset_config = DATASET_CONFIGS[WOMENS_CLOTHING_ECOMMERCE_REVIEWS] diff --git a/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py b/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py index a738c891..c09f6691 100644 --- a/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py +++ b/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py @@ -1,6 +1,6 @@ import re from typing import NamedTuple, List, Optional, Tuple -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, Mock import pytest import ray @@ -9,7 +9,7 @@ from fmeval.constants import ( DatasetColumns, - MIME_TYPE_JSON, + MIME_TYPE_JSON, BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE, MEAN, ) from fmeval.data_loaders.data_config import DataConfig from fmeval.eval_algorithms import ( @@ -23,9 +23,6 @@ from fmeval.eval_algorithms.classification_accuracy_semantic_robustness import ( ClassificationAccuracySemanticRobustnessConfig, ClassificationAccuracySemanticRobustness, - RANDOM_UPPER_CASE, - WHITESPACE_ADD_REMOVE, - BUTTER_FINGER, DELTA_CLASSIFICATION_ACCURACY_SCORE, ) from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE @@ -214,9 +211,9 @@ def test_classification_accuracy_semantic_robustness_evaluate_sample(self, test_ """ model = MagicMock() model.predict.side_effect = [ - (test_case.original_model_output,), - (test_case.perturbed_model_output_1,), - (test_case.perturbed_model_output_2,), + (test_case.original_model_output, None), + (test_case.perturbed_model_output_1, None), + (test_case.perturbed_model_output_2, None), ] eval_algorithm = ClassificationAccuracySemanticRobustness(test_case.config) @@ -228,270 +225,88 @@ def test_classification_accuracy_semantic_robustness_evaluate_sample(self, test_ ) assert model.predict.call_count == 3 - @pytest.mark.parametrize( - "test_case", - [ - TestCaseClassificationAccuracySemanticRobustnessEvaluateSample( - model_input="Ok brownie.", - original_model_output="3", - perturbed_model_output_1="Some model output.", - perturbed_model_output_2="Some model output.", - target_output="3", - expected_response=[ - EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=1.0), - EvalScore(name=DELTA_CLASSIFICATION_ACCURACY_SCORE, value=1.0), - ], - config=ClassificationAccuracySemanticRobustnessConfig( - valid_labels=["1", "2", "3", "4", "5"], - num_perturbations=2, - ), - ) - ], - ) - def test_classification_accuracy_semantic_robustness_evaluate_sample_with_model_output(self, test_case): - """ - GIVEN valid inputs with model_output - WHEN ClassificationAccuracySemanticRobustness.evaluate_sample is called - THEN correct List of EvalScores is returned - """ - model = MagicMock() - model.predict.side_effect = [ - (test_case.perturbed_model_output_1,), - (test_case.perturbed_model_output_2,), - ] - - eval_algorithm = ClassificationAccuracySemanticRobustness(test_case.config) - assert ( - eval_algorithm.evaluate_sample( - model_input=test_case.model_input, - model=model, - model_output=test_case.original_model_output, - target_output=test_case.target_output, - ) - == test_case.expected_response - ) - assert model.predict.call_count == 2 - - class TestCaseClassificationAccuracySemanticRobustnessEvaluate(NamedTuple): - input_dataset: Dataset - input_dataset_with_generated_model_output: Dataset - prompt_template: Optional[str] - dataset_config: Optional[DataConfig] - expected_response: List[EvalOutput] - save_data: bool + class TestCaseEvaluate(NamedTuple): + user_provided_prompt_template: Optional[str] + dataset_prompt_template: str + valid_labels: Optional[List[str]] = None @pytest.mark.parametrize( "test_case", [ - # Built-in datasets evaluate for dataset without category - TestCaseClassificationAccuracySemanticRobustnessEvaluate( - input_dataset=DATASET_WITHOUT_MODEL_OUTPUT.drop_columns(cols=DatasetColumns.CATEGORY.value.name), - input_dataset_with_generated_model_output=DATASET_WITHOUT_CATEGORY, - dataset_config=None, - prompt_template=None, - save_data=True, - expected_response=[ - EvalOutput( - eval_name="classification_accuracy_semantic_robustness", - dataset_name=WOMENS_CLOTHING_ECOMMERCE_REVIEWS, - dataset_scores=[ - EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=0.0), - EvalScore(name=DELTA_CLASSIFICATION_ACCURACY_SCORE, value=0.0), - ], - prompt_template=BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES[WOMENS_CLOTHING_ECOMMERCE_REVIEWS], - category_scores=None, - output_path="/tmp/eval_results/classification_accuracy_semantic_robustness_womens_clothing_ecommerce_reviews.jsonl", - ), - ], + TestCaseEvaluate( + user_provided_prompt_template="Summarize: $model_input", + dataset_prompt_template="Summarize: $model_input", + valid_labels=["0", "1"], ), - # Built-in datasets evaluate for dataset with category - TestCaseClassificationAccuracySemanticRobustnessEvaluate( - input_dataset=DATASET_WITHOUT_MODEL_OUTPUT, - input_dataset_with_generated_model_output=DATASET, - dataset_config=None, - prompt_template=None, - save_data=True, - expected_response=[ - EvalOutput( - eval_name="classification_accuracy_semantic_robustness", - dataset_name=WOMENS_CLOTHING_ECOMMERCE_REVIEWS, - dataset_scores=[ - EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=0.0), - EvalScore(name=DELTA_CLASSIFICATION_ACCURACY_SCORE, value=0.0), - ], - prompt_template=BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES[WOMENS_CLOTHING_ECOMMERCE_REVIEWS], - category_scores=CATEGORY_SCORES, - output_path="/tmp/eval_results/classification_accuracy_semantic_robustness_womens_clothing_ecommerce_reviews.jsonl", - ), - ], - ), - # Custom dataset evaluate, with input prompt template - TestCaseClassificationAccuracySemanticRobustnessEvaluate( - input_dataset=DATASET_WITHOUT_MODEL_OUTPUT.drop_columns(cols=DatasetColumns.CATEGORY.value.name), - input_dataset_with_generated_model_output=DATASET_WITHOUT_CATEGORY, - dataset_config=DataConfig( - dataset_name="my_custom_dataset", - dataset_uri="tba", - dataset_mime_type=MIME_TYPE_JSON, - model_input_location="tba", - target_output_location="tba", - model_output_location=None, - category_location="tba", - ), - prompt_template="$model_input", - save_data=False, - expected_response=[ - EvalOutput( - eval_name="classification_accuracy_semantic_robustness", - dataset_name="my_custom_dataset", - dataset_scores=[ - EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=0.0), - EvalScore(name=DELTA_CLASSIFICATION_ACCURACY_SCORE, value=0.0), - ], - prompt_template="$model_input", - category_scores=None, - output_path="/tmp/eval_results/classification_accuracy_semantic_robustness_my_custom_dataset.jsonl", - ), - ], - ), - # Custom dataset evaluate, without input prompt template - TestCaseClassificationAccuracySemanticRobustnessEvaluate( - input_dataset=DATASET_WITHOUT_MODEL_OUTPUT.drop_columns(cols=DatasetColumns.CATEGORY.value.name), - input_dataset_with_generated_model_output=DATASET_WITHOUT_CATEGORY, - dataset_config=DataConfig( - dataset_name="my_custom_dataset", - dataset_uri="tba", - dataset_mime_type=MIME_TYPE_JSON, - model_input_location="tba", - target_output_location="tba", - model_output_location=None, - category_location="tba", - ), - prompt_template=None, - save_data=False, - expected_response=[ - EvalOutput( - eval_name="classification_accuracy_semantic_robustness", - dataset_name="my_custom_dataset", - dataset_scores=[ - EvalScore(name=CLASSIFICATION_ACCURACY_SCORE, value=0.0), - EvalScore(name=DELTA_CLASSIFICATION_ACCURACY_SCORE, value=0.0), - ], - prompt_template=DEFAULT_PROMPT_TEMPLATE, - category_scores=None, - output_path="/tmp/eval_results/classification_accuracy_semantic_robustness_my_custom_dataset.jsonl", - ), - ], + TestCaseEvaluate( + user_provided_prompt_template=None, + dataset_prompt_template="$model_input", ), ], ) + @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_eval_results_path") + @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.evaluate_dataset") + @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.ClassificationAccuracySemanticRobustness._build_pipeline") @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_dataset") - @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.save_dataset") - @patch( - "fmeval.eval_algorithms.classification_accuracy_semantic_robustness.generate_model_predict_response_for_dataset" - ) - @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.ClassificationAccuracy") - def test_classification_accuracy_semantic_robustness_evaluate( - self, - classification_accuracy, - generate_model_predict_response_for_dataset, - save_dataset, - get_dataset, - test_case, - config, + @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_dataset_configs") + def test_evaluate( + self, + mock_get_dataset_configs, + mock_get_dataset, + mock_build_pipeline, + mock_evaluate_dataset, + mock_get_results_path, + test_case, ): """ - GIVEN valid inputs i.e. input data config for a dataset without model_outputs, an input ModelRunner - and request to save records with scores - WHEN ClassificationAccuracySemanticRobustness evaluate() method is called - THEN correct EvalOutput is returned + GIVEN a ClassificationAccuracySemanticRobustness instance. + WHEN its evaluate method is called with valid arguments. + THEN `evaluate_dataset` is called with the correct arguments. """ - get_dataset.return_value = test_case.input_dataset - generate_model_predict_response_for_dataset.return_value = test_case.input_dataset_with_generated_model_output - classification_accuracy.return_value = MagicMock() + dataset_config = Mock() + dataset_config.dataset_name = "my_custom_dataset" + mock_get_dataset_configs.return_value = [dataset_config] - eval_algorithm = ClassificationAccuracySemanticRobustness() - actual_response = eval_algorithm.evaluate( - model=ConstantModel(), - dataset_config=test_case.dataset_config, - save=test_case.save_data, - prompt_template=test_case.prompt_template, + mock_dataset = Mock() + # So that validate_dataset does not error + mock_dataset.columns = Mock( + return_value=[DatasetColumns.MODEL_INPUT.value.name, DatasetColumns.TARGET_OUTPUT.value.name] ) - assert save_dataset.called == test_case.save_data - assert actual_response == test_case.expected_response + mock_dataset.unique = Mock(return_value=["0", "1", "2"]) + # So that the uniqueness factor check passes + mock_dataset.count = Mock(return_value=100) + mock_get_dataset.return_value = mock_dataset - class TestCaseClassificationAccuracySemanticRobustnessEvaluateInvalid(NamedTuple): - input_dataset: Dataset - dataset_config: Optional[DataConfig] - prompt_template: Optional[str] - model_provided: bool - expected_error_message: str + mock_build_pipeline.return_value = Mock() + mock_get_results_path.return_value = "/path/to/results" + model_runner = Mock() - @pytest.mark.parametrize( - "test_case", - [ - TestCaseClassificationAccuracySemanticRobustnessEvaluateInvalid( - input_dataset=DATASET_WITHOUT_CATEGORY, - dataset_config=None, - prompt_template=None, - model_provided=False, - expected_error_message="Missing required input: model i.e. ModelRunner, for ClassificationAccuracySemanticRobustness " - "evaluate", - ), - TestCaseClassificationAccuracySemanticRobustnessEvaluateInvalid( - input_dataset=DATASET_WITHOUT_CATEGORY.drop_columns(cols=[DatasetColumns.MODEL_INPUT.value.name]), - dataset_config=DataConfig( - dataset_name="my_custom_dataset", - dataset_uri="tba", - dataset_mime_type=MIME_TYPE_JSON, - model_input_location="tba", - target_output_location="tba", - model_output_location=None, - category_location="tba", - ), - prompt_template=None, - model_provided=True, - expected_error_message="Missing required column: model_input, for evaluate", - ), - TestCaseClassificationAccuracySemanticRobustnessEvaluateInvalid( - input_dataset=DATASET_WITHOUT_CATEGORY.drop_columns(cols=[DatasetColumns.TARGET_OUTPUT.value.name]), - dataset_config=DataConfig( - dataset_name="my_custom_dataset", - dataset_uri="tba", - dataset_mime_type=MIME_TYPE_JSON, - model_input_location="tba", - target_output_location="tba", - model_output_location=None, - category_location="tba", - ), - prompt_template=None, - model_provided=True, - expected_error_message="Missing required column: target_output, for evaluate", - ), - ], - ) - @patch("fmeval.model_runners.model_runner.ModelRunner") - @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_dataset") - @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.ClassificationAccuracy") - def test_classification_accuracy_semantic_robustness_evaluate_invalid_input( - self, - classification_accuracy, - get_dataset, - model, - test_case, - config, - ): - """ - GIVEN invalid inputs - WHEN ClassificationAccuracySemanticRobustness evaluate is called - THEN correct exception with proper message is raised - """ - classification_accuracy.return_value = MagicMock() - eval_algorithm = ClassificationAccuracySemanticRobustness(config) - get_dataset.return_value = test_case.input_dataset - if not test_case.model_provided: - model = None - with pytest.raises(EvalAlgorithmClientError, match=re.escape(test_case.expected_error_message)): - eval_algorithm.evaluate( - model=model, dataset_config=test_case.dataset_config, prompt_template=test_case.prompt_template - ) + eval_algo = ClassificationAccuracySemanticRobustness( + ClassificationAccuracySemanticRobustnessConfig(valid_labels=test_case.valid_labels) + ) + output = eval_algo.evaluate( + model=model_runner, + dataset_config=dataset_config, + prompt_template=test_case.user_provided_prompt_template, + num_records=162, + save=True, + ) + + mock_evaluate_dataset.assert_called_once_with( + dataset=mock_dataset, + pipeline=mock_build_pipeline.return_value, + dataset_name=dataset_config.dataset_name, + eval_name=eval_algo.eval_name, + metric_names=[CLASSIFICATION_ACCURACY_SCORE, DELTA_CLASSIFICATION_ACCURACY_SCORE], + eval_results_path="/path/to/results", + model=model_runner, + prompt_template=test_case.dataset_prompt_template, + agg_method=MEAN, + save=True, + ) + mock_build_pipeline.assert_called_with( + model_runner, + test_case.dataset_prompt_template, + test_case.valid_labels if test_case.valid_labels else mock_dataset.unique.return_value + ) + assert output == [mock_evaluate_dataset.return_value] diff --git a/test/unit/eval_algorithms/test_qa_accuracy_semantic_robustness.py b/test/unit/eval_algorithms/test_qa_accuracy_semantic_robustness.py index daad9668..e0ef2eb5 100644 --- a/test/unit/eval_algorithms/test_qa_accuracy_semantic_robustness.py +++ b/test/unit/eval_algorithms/test_qa_accuracy_semantic_robustness.py @@ -224,7 +224,7 @@ class TestCaseEvaluate(NamedTuple): ) @patch("fmeval.eval_algorithms.qa_accuracy_semantic_robustness.get_eval_results_path") @patch("fmeval.eval_algorithms.qa_accuracy_semantic_robustness.evaluate_dataset") - @patch("fmeval.eval_algorithms.qa_accuracy_semantic_robustness." "QAAccuracySemanticRobustness.build_pipeline") + @patch("fmeval.eval_algorithms.qa_accuracy_semantic_robustness.QAAccuracySemanticRobustness._build_pipeline") @patch("fmeval.eval_algorithms.qa_accuracy_semantic_robustness.get_dataset") @patch("fmeval.eval_algorithms.qa_accuracy_semantic_robustness.get_dataset_configs") def test_evaluate( From 7f9e7c8ca15738a158ca82fbd6c4156ae65aad9a Mon Sep 17 00:00:00 2001 From: Daniel Zhu Date: Wed, 27 Mar 2024 15:06:34 -0700 Subject: [PATCH 2/2] fix linting --- ...sification_accuracy_semantic_robustness.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py b/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py index c09f6691..ebb8db71 100644 --- a/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py +++ b/test/unit/eval_algorithms/test_classification_accuracy_semantic_robustness.py @@ -5,13 +5,15 @@ import pytest import ray from _pytest.fixtures import fixture -from ray.data import Dataset from fmeval.constants import ( DatasetColumns, - MIME_TYPE_JSON, BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE, MEAN, + MIME_TYPE_JSON, + BUTTER_FINGER, + RANDOM_UPPER_CASE, + WHITESPACE_ADD_REMOVE, + MEAN, ) -from fmeval.data_loaders.data_config import DataConfig from fmeval.eval_algorithms import ( EvalScore, EvalOutput, @@ -246,17 +248,19 @@ class TestCaseEvaluate(NamedTuple): ) @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_eval_results_path") @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.evaluate_dataset") - @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.ClassificationAccuracySemanticRobustness._build_pipeline") + @patch( + "fmeval.eval_algorithms.classification_accuracy_semantic_robustness.ClassificationAccuracySemanticRobustness._build_pipeline" + ) @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_dataset") @patch("fmeval.eval_algorithms.classification_accuracy_semantic_robustness.get_dataset_configs") def test_evaluate( - self, - mock_get_dataset_configs, - mock_get_dataset, - mock_build_pipeline, - mock_evaluate_dataset, - mock_get_results_path, - test_case, + self, + mock_get_dataset_configs, + mock_get_dataset, + mock_build_pipeline, + mock_evaluate_dataset, + mock_get_results_path, + test_case, ): """ GIVEN a ClassificationAccuracySemanticRobustness instance. @@ -307,6 +311,6 @@ def test_evaluate( mock_build_pipeline.assert_called_with( model_runner, test_case.dataset_prompt_template, - test_case.valid_labels if test_case.valid_labels else mock_dataset.unique.return_value + test_case.valid_labels if test_case.valid_labels else mock_dataset.unique.return_value, ) assert output == [mock_evaluate_dataset.return_value]