1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/classification_accuracy.py
@@ -122,6 +122,7 @@ def __init__(self, eval_algorithm_config: ClassificationAccuracyConfig = Classif

:param eval_algorithm_config: Classification Accuracy eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self._eval_algorithm_config = eval_algorithm_config
self._valid_labels = self._eval_algorithm_config.valid_labels

1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/classification_accuracy_semantic_robustness.py
@@ -138,6 +138,7 @@ def __init__(

:param eval_algorithm_config: Classification Accuracy Semantic Robustness eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self.eval_name = CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS
self._eval_algorithm_config = eval_algorithm_config
self._classification_accuracy_eval_algo = ClassificationAccuracy(
68 changes: 49 additions & 19 deletions src/fmeval/eval_algorithms/eval_algorithm.py
@@ -1,47 +1,77 @@
from abc import ABC, abstractmethod
from typing import List
from typing import List, Optional

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalScore, EvalOutput
from fmeval.model_runners.model_runner import ModelRunner


class EvalAlgorithmConfig:
"""Configuration class to be used or extended to provide evaluation algorithm-specific parameters."""
"""Configuration class to be inherited from to provide evaluation algorithm-specific parameters."""


class EvalAlgorithmInterface(ABC):
"""Interface for evaluation algorithms.

This interface defines two required methods that all evaluation algorithms must implement.
The signatures of these methods are intentionally as generic as possible, to allow for
maximum freedom when implementing a new evaluation algorithm.
"""

def __init__(self, eval_algorithm_config: EvalAlgorithmConfig):
"""Initialize an evaluation algorithm instance.

:param eval_algorithm_config: Contains all configurable parameters for the evaluation algorithm.
"""

@abstractmethod
def evaluate_sample(self, *args, **kwargs) -> List[EvalScore]:
def evaluate_sample(
self,
model_input: Optional[str] = None,
model_output: Optional[str] = None,
target_output: Optional[str] = None,
model: Optional[ModelRunner] = None,
) -> List[EvalScore]:
"""Compute metrics for a single sample, where a sample is defined by the particular algorithm.

The arguments to this method should be any data that pertains to the sample to be evaluated,
and any additional data used to compute the relevant metrics/scores.
The `evaluate_sample` method implemented by each algorithm uses a subset of
these input parameters; not all of them are required.

Example:
The evaluate_sample method of the FactualKnowledge evaluation algorithm takes
two arguments: `target_output` and `model_output`, which are used to compute the factual
knowledge score.
:param model_input: The input passed to `model`. If this parameter is not None,
`model` should likewise not be None.
:param model_output: The output from invoking a model. If provided, `model` generally
will not be required, as the output is already available.
:param target_output: The reference output that `model_output` will be compared against.
Note that if `model_output` is not provided but `model` and `model_input` are provided
instead, the output from invoking `model` will take the place of `model_output`.
:param model: A ModelRunner representing the model being evaluated.

:returns: A list of EvalScore objects, where each EvalScore represents a single
score/metric that is computed by the evaluation algorithm.
See the built-in evaluation algorithms (ex: FactualKnowledge, SummarizationAccuracy)
for concrete examples.
"""

@abstractmethod
def evaluate(self, *args, **kwargs) -> List[EvalOutput]:
def evaluate(
self,
model: Optional[ModelRunner] = None,
dataset_config: Optional[DataConfig] = None,
prompt_template: Optional[str] = None,
num_records: int = 100,
save: bool = False,
) -> List[EvalOutput]:
"""Compute metrics on all samples in one or more datasets.

The format that the dataset(s) in question take is up to the implementer
of the evaluation algorithm. All built-in evaluation algorithms in fmeval
currently utilize Ray Datasets. See the built-in evaluation algorithms
(ex: FactualKnowledge, SummarizationAccuracy) for concrete examples of how
to implement the `evaluate` method.
:param model: An instance of ModelRunner representing the model being evaluated.
:param dataset_config: Configures the single dataset used for the evaluation.
If not provided, this method will run evaluations using all of its supported
built-in datasets.
:param prompt_template: A template used to generate prompts from raw text inputs.
This parameter is not required if you wish to run evaluations using the built-in
datasets, as they have their own default prompt templates pre-configured.
:param save: If True, model responses and scores will be saved to a file.
By default, the directory that this output file gets written to is
DEFAULT_EVAL_RESULTS_PATH, but this directory can be configured through
the EVAL_RESULTS_PATH environment variable.
:param num_records: The number of records to be randomly sampled from the input dataset
that is used for the evaluation.

:returns: A list of EvalOutput objects, where an EvalOutput encapsulates
the EvalScores (and optionally, CategoryScores) generated by the evaluation,
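To make the revised interface concrete, here is a minimal, hypothetical subclass sketch that is not part of this PR. The `ExactMatch` class, its `eval_name`, and its scoring rule are invented for illustration; `EvalScore` is assumed to expose `name` and `value` fields, and the dataset-handling half of `evaluate` is deliberately left unimplemented.

```python
from typing import List, Optional

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalOutput, EvalScore
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig, EvalAlgorithmInterface
from fmeval.model_runners.model_runner import ModelRunner


class ExactMatch(EvalAlgorithmInterface):
    """Hypothetical algorithm: scores 1.0 when the model output exactly matches the target output."""

    eval_name = "exact_match"  # illustrative name, not a built-in fmeval algorithm

    def __init__(self, eval_algorithm_config: EvalAlgorithmConfig = EvalAlgorithmConfig()):
        # The pattern this PR enforces: always delegate to the base class initializer.
        super().__init__(eval_algorithm_config)

    def evaluate_sample(
        self,
        model_input: Optional[str] = None,
        model_output: Optional[str] = None,
        target_output: Optional[str] = None,
        model: Optional[ModelRunner] = None,
    ) -> List[EvalScore]:
        # Like FactualKnowledge, this algorithm only needs a subset of the parameters.
        if model_output is None or target_output is None:
            raise ValueError("ExactMatch requires both model_output and target_output.")
        score = float(model_output.strip() == target_output.strip())
        return [EvalScore(name=self.eval_name, value=score)]

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[DataConfig] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
    ) -> List[EvalOutput]:
        # Dataset loading and score aggregation are omitted here; see the built-in
        # algorithms (e.g. FactualKnowledge, SummarizationAccuracy) for full examples.
        raise NotImplementedError
```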
1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/factual_knowledge.py
@@ -73,6 +73,7 @@ def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowle

:param eval_algorithm_config: Factual knowledge eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self.eval_name = FACTUAL_KNOWLEDGE
self._eval_algorithm_config = eval_algorithm_config

14 changes: 9 additions & 5 deletions src/fmeval/eval_algorithms/general_semantic_robustness.py
@@ -127,14 +127,15 @@ def __init__(
`evaluate_sample` method, which is a computationally cheap operation that does not
require utilizing Ray for parallel execution.
"""
super().__init__(eval_algorithm_config)
self.num_perturbations = eval_algorithm_config.num_perturbations
self.num_baseline_samples = eval_algorithm_config.num_baseline_samples
self.perturbation_transform = get_perturbation_transform(eval_algorithm_config)
self.bertscore_model = BertscoreModel(eval_algorithm_config.model_type_for_bertscore)
if use_ray:
self.bertscore_model = create_shared_resource(self.bertscore_model)

def build_pipeline(
def _build_pipeline(
self,
model: ModelRunner,
prompt_template: str,
@@ -159,12 +160,15 @@ def build_pipeline(
:returns: A TransformPipeline that can be used by either `evaluate_sample` or `evaluate`.
"""

transforms = get_model_responses_from_perturbed_inputs(
(
get_perturbed_inputs,
gen_perturbed_prompts,
get_perturbed_responses,
) = get_model_responses_from_perturbed_inputs(
self.perturbation_transform,
prompt_template,
model,
)
get_perturbed_inputs, gen_perturbed_prompts, get_perturbed_responses = transforms

original_model_output_key = DatasetColumns.MODEL_OUTPUT.value.name
# Compute BERTScores with target_output = the original model output
@@ -286,7 +290,7 @@ def evaluate_sample(
DatasetColumns.PROMPT.value.name: prompt,
DatasetColumns.MODEL_OUTPUT.value.name: model_output,
}
pipeline = self.build_pipeline(model, prompt_template, is_deterministic=is_deterministic)
pipeline = self._build_pipeline(model, prompt_template, is_deterministic=is_deterministic)
output_record = pipeline.execute_record(sample)

bert_score_dissimilarity_value = output_record[BERT_SCORE_DISSIMILARITY]
@@ -329,7 +333,7 @@ def evaluate(
model_invocation_pipeline = create_model_invocation_pipeline(model, dataset_prompt_template)
dataset = model_invocation_pipeline.execute(dataset)
is_deterministic = verify_model_determinism(model, dataset, DatasetColumns.PROMPT.value.name)
pipeline = self.build_pipeline(model, dataset_prompt_template, is_deterministic=is_deterministic)
pipeline = self._build_pipeline(model, dataset_prompt_template, is_deterministic=is_deterministic)
eval_output = compute_and_aggregate_metrics(
pipeline=pipeline,
dataset=dataset,
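The constructor docstring above notes that `use_ray` can be disabled because `evaluate_sample` is computationally cheap. Below is a hedged usage sketch, not taken from this PR: the `GeneralSemanticRobustnessConfig` class name, its `num_perturbations` field, the `use_ray` keyword, and the exact `evaluate_sample` parameters (`model_input`, `model`) are assumptions inferred from this diff, and the model runner is a placeholder you would supply.

```python
from fmeval.eval_algorithms.general_semantic_robustness import (
    GeneralSemanticRobustness,
    GeneralSemanticRobustnessConfig,  # assumed config class name
)

# use_ray=False keeps the BERTScore model in-process, which the docstring above
# describes as sufficient for the computationally cheap evaluate_sample path.
config = GeneralSemanticRobustnessConfig(num_perturbations=3)
algo = GeneralSemanticRobustness(config, use_ray=False)

model_runner = ...  # placeholder: any existing fmeval ModelRunner implementation

scores = algo.evaluate_sample(
    model_input="Summarize: The quick brown fox jumps over the lazy dog.",
    model=model_runner,
)
for score in scores:
    print(score.name, score.value)
```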
5 changes: 4 additions & 1 deletion src/fmeval/eval_algorithms/prompt_stereotyping.py
@@ -8,7 +8,7 @@
MEAN,
)
from fmeval.data_loaders.util import DataConfig, get_dataset
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface, EvalAlgorithmConfig
from fmeval.eval_algorithms import (
EvalAlgorithm,
EvalOutput,
@@ -53,6 +53,9 @@ class PromptStereotyping(EvalAlgorithmInterface):

eval_name = PROMPT_STEREOTYPING

def __init__(self):
super().__init__(EvalAlgorithmConfig())

def evaluate(
self,
model: Optional[ModelRunner] = None,
1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/qa_accuracy.py
@@ -242,6 +242,7 @@ def __init__(self, eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig())

:param eval_algorithm_config: QA Accuracy eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self._eval_algorithm_config = eval_algorithm_config

def evaluate(
1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/qa_accuracy_semantic_robustness.py
@@ -139,6 +139,7 @@ def __init__(

:param eval_algorithm_config: QA Accuracy Semantic Robustness eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self.eval_name = QA_ACCURACY_SEMANTIC_ROBUSTNESS
self._eval_algorithm_config = eval_algorithm_config

17 changes: 16 additions & 1 deletion src/fmeval/eval_algorithms/semantic_robustness_utils.py
@@ -2,6 +2,7 @@
from typing import Tuple

from fmeval.constants import BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE, DatasetColumns
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.transforms.common import GeneratePrompt, GetModelOutputs
from fmeval.transforms.semantic_perturbations import (
@@ -21,7 +22,7 @@


@dataclass(frozen=True)
class SemanticRobustnessConfig:
class SemanticRobustnessConfig(EvalAlgorithmConfig):
"""Configures the semantic robustness evaluation algorithms.

:param perturbation_type: Perturbation type for generating perturbed inputs.
@@ -53,6 +54,12 @@ def __post_init__(self):


def get_perturbation_transform(config: SemanticRobustnessConfig) -> SemanticPerturbation:
"""Returns a semantic perturbation transform based on parameters in `config`.

:param config: A config that specifies a perturbation type, which dictates the
SemanticPerturbation that gets returned, and its configurable parameters.
:returns: A SemanticPerturbation instance, initialized with parameters passed via `config`.
"""
if config.perturbation_type == BUTTER_FINGER:
return ButterFinger(
input_key=DatasetColumns.MODEL_INPUT.value.name,
@@ -91,6 +98,14 @@ def get_model_responses_from_perturbed_inputs(
prompt_template: str,
model: ModelRunner,
) -> Tuple[SemanticPerturbation, GeneratePrompt, GetModelOutputs]:
"""Returns a tuple of transforms for perturbing model inputs, composing prompts, and getting model outputs.

:param perturbation: The semantic perturbation transform used to perturb inputs.
:param prompt_template: The template used for composing prompts out of the perturbed inputs.
:param model: The model that is invoked on the prompts constructed from perturbed inputs.
:returns: A tuple of three transforms, where the first is the same SemanticPerturbation
that was passed in, and the latter two are created in this function.
"""
# Generate prompts from perturbed inputs
gen_perturbed_prompts = GeneratePrompt(
input_keys=perturbation.output_keys,
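Since the new docstrings above describe how these helpers are meant to be used, here is a brief, hedged usage sketch that is not part of this PR. It assumes `SemanticRobustnessConfig` can be constructed with just `perturbation_type`, and that the returned `SemanticPerturbation`, like other fmeval transforms, can be applied directly to a record dict keyed by dataset column names.

```python
from fmeval.constants import BUTTER_FINGER, DatasetColumns
from fmeval.eval_algorithms.semantic_robustness_utils import (
    SemanticRobustnessConfig,
    get_perturbation_transform,
)

# Pick a perturbation type; get_perturbation_transform dispatches on it.
config = SemanticRobustnessConfig(perturbation_type=BUTTER_FINGER)
perturbation = get_perturbation_transform(config)

# Assumption: a transform is callable on a single record (a dict keyed by column
# names) and writes its results under perturbation.output_keys.
record = {DatasetColumns.MODEL_INPUT.value.name: "London is the capital of England."}
perturbed_record = perturbation(record)

for key in perturbation.output_keys:
    print(perturbed_record[key])
```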