1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/classification_accuracy.py
@@ -122,6 +122,7 @@ def __init__(self, eval_algorithm_config: ClassificationAccuracyConfig = Classif

:param eval_algorithm_config: Classification Accuracy eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self._eval_algorithm_config = eval_algorithm_config
self._valid_labels = self._eval_algorithm_config.valid_labels

1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/classification_accuracy_semantic_robustness.py
@@ -138,6 +138,7 @@ def __init__(

:param eval_algorithm_config: Classification Accuracy Semantic Robustness eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self.eval_name = CLASSIFICATION_ACCURACY_SEMANTIC_ROBUSTNESS
self._eval_algorithm_config = eval_algorithm_config
self._classification_accuracy_eval_algo = ClassificationAccuracy(
68 changes: 49 additions & 19 deletions src/fmeval/eval_algorithms/eval_algorithm.py
@@ -1,47 +1,77 @@
from abc import ABC, abstractmethod
from typing import List
from typing import List, Optional

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalScore, EvalOutput
from fmeval.model_runners.model_runner import ModelRunner


class EvalAlgorithmConfig:
"""Configuration class to be used or extended to provide evaluation algorithm-specific parameters."""
"""Configuration class to be inherited from to provide evaluation algorithm-specific parameters."""


class EvalAlgorithmInterface(ABC):
"""Interface for evaluation algorithms.

This interface defines two required methods that all evaluation algorithms must implement.
The signatures of these methods are intentionally as generic as possible, to allow for
maximum freedom when implementing a new evaluation algorithm.
"""

def __init__(self, eval_algorithm_config: EvalAlgorithmConfig):
"""Initialize an evaluation algorithm instance.

:param eval_algorithm_config: Contains all configurable parameters for the evaluation algorithm.
"""

@abstractmethod
def evaluate_sample(self, *args, **kwargs) -> List[EvalScore]:
def evaluate_sample(
self,
model_input: Optional[str] = None,
model_output: Optional[str] = None,
target_output: Optional[str] = None,
model: Optional[ModelRunner] = None,
) -> List[EvalScore]:
"""Compute metrics for a single sample, where a sample is defined by the particular algorithm.

The arguments to this method should be any data that pertains to the sample to be evaluated,
and any additional data used to compute the relevant metrics/scores.
The `evaluate_sample` method implemented by each algorithm uses a subset of
these input parameters; not all of them are required.

Example:
The evaluate_sample method of the FactualKnowledge evaluation algorithm takes
two arguments: `target_output` and `model_output`, which are used to compute the factual
knowledge score.
:param model_input: The input passed to `model`. If this parameter is not None,
`model` should likewise not be None.
:param model_output: The output from invoking a model. If provided, `model` generally
will not be required, as the output is already available.
:param target_output: The reference output that `model_output` will be compared against.
Note that if `model_output` is not provided but `model` and `model_input` are provided
instead, the output from invoking `model` will take the place of `model_output`.
:param model: A ModelRunner representing the model being evaluated.

:returns: A list of EvalScore objects, where each EvalScore represents a single
score/metric that is computed by the evaluation algorithm.
See the built-in evaluation algorithms (ex: FactualKnowledge, SummarizationAccuracy)
for concrete examples.
"""

@abstractmethod
def evaluate(self, *args, **kwargs) -> List[EvalOutput]:
def evaluate(
self,
model: Optional[ModelRunner] = None,
dataset_config: Optional[DataConfig] = None,
prompt_template: Optional[str] = None,
num_records: int = 100,
save: bool = False,
) -> List[EvalOutput]:
"""Compute metrics on all samples in one or more datasets.

The format that the dataset(s) in question take is up to the implementer
of the evaluation algorithm. All built-in evaluation algorithms in fmeval
currently utilize Ray Datasets. See the built-in evaluation algorithms
(ex: FactualKnowledge, SummarizationAccuracy) for concrete examples of how
to implement the `evaluate` method.
:param model: An instance of ModelRunner representing the model being evaluated.
:param dataset_config: Configures the single dataset used for the evaluation.
If not provided, this method will run evaluations using all of its supported
built-in datasets.
:param prompt_template: A template used to generate prompts from raw text inputs.
This parameter is not required if you wish to run evaluations using the built-in
datasets, as they have their own default prompt templates pre-configured.
:param save: If True, model responses and scores will be saved to a file.
By default, the directory that this output file gets written to is
DEFAULT_EVAL_RESULTS_PATH, but this directory can be configured through
the EVAL_RESULTS_PATH environment variable.
:param num_records: The number of records to be randomly sampled from the input dataset
that is used for the evaluation.

:returns: A list of EvalOutput objects, where an EvalOutput encapsulates
the EvalScores (and optionally, CategoryScores) generated by the evaluation,
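To make the revised interface concrete, here is a minimal, hypothetical subclass sketch that is not part of this PR. The `ExactMatch` class, its `eval_name`, and its scoring rule are invented for illustration; `EvalScore` is assumed to expose `name` and `value` fields, and the dataset-handling half of `evaluate` is deliberately left unimplemented.

```python
from typing import List, Optional

from fmeval.data_loaders.data_config import DataConfig
from fmeval.eval_algorithms import EvalOutput, EvalScore
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig, EvalAlgorithmInterface
from fmeval.model_runners.model_runner import ModelRunner


class ExactMatch(EvalAlgorithmInterface):
    """Hypothetical algorithm: scores 1.0 when the model output exactly matches the target output."""

    eval_name = "exact_match"  # illustrative name, not a built-in fmeval algorithm

    def __init__(self, eval_algorithm_config: EvalAlgorithmConfig = EvalAlgorithmConfig()):
        # The pattern this PR enforces: always delegate to the base class initializer.
        super().__init__(eval_algorithm_config)

    def evaluate_sample(
        self,
        model_input: Optional[str] = None,
        model_output: Optional[str] = None,
        target_output: Optional[str] = None,
        model: Optional[ModelRunner] = None,
    ) -> List[EvalScore]:
        # Like FactualKnowledge, this algorithm only needs a subset of the parameters.
        if model_output is None or target_output is None:
            raise ValueError("ExactMatch requires both model_output and target_output.")
        score = float(model_output.strip() == target_output.strip())
        return [EvalScore(name=self.eval_name, value=score)]

    def evaluate(
        self,
        model: Optional[ModelRunner] = None,
        dataset_config: Optional[DataConfig] = None,
        prompt_template: Optional[str] = None,
        num_records: int = 100,
        save: bool = False,
    ) -> List[EvalOutput]:
        # Dataset loading and score aggregation are omitted here; see the built-in
        # algorithms (e.g. FactualKnowledge, SummarizationAccuracy) for full examples.
        raise NotImplementedError
```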
1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/factual_knowledge.py
@@ -73,6 +73,7 @@ def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowle

:param eval_algorithm_config: Factual knowledge eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self.eval_name = FACTUAL_KNOWLEDGE
self._eval_algorithm_config = eval_algorithm_config

14 changes: 9 additions & 5 deletions src/fmeval/eval_algorithms/general_semantic_robustness.py
@@ -127,14 +127,15 @@ def __init__(
`evaluate_sample` method, which is a computationally cheap operation that does not
require utilizing Ray for parallel execution.
"""
super().__init__(eval_algorithm_config)
self.num_perturbations = eval_algorithm_config.num_perturbations
self.num_baseline_samples = eval_algorithm_config.num_baseline_samples
self.perturbation_transform = get_perturbation_transform(eval_algorithm_config)
self.bertscore_model = BertscoreModel(eval_algorithm_config.model_type_for_bertscore)
if use_ray:
self.bertscore_model = create_shared_resource(self.bertscore_model)

def build_pipeline(
def _build_pipeline(
self,
model: ModelRunner,
prompt_template: str,
@@ -159,12 +160,15 @@ def build_pipeline(
:returns: A TransformPipeline that can be used by either `evaluate_sample` or `evaluate`.
"""

transforms = get_model_responses_from_perturbed_inputs(
(
get_perturbed_inputs,
gen_perturbed_prompts,
get_perturbed_responses,
) = get_model_responses_from_perturbed_inputs(
self.perturbation_transform,
prompt_template,
model,
)
get_perturbed_inputs, gen_perturbed_prompts, get_perturbed_responses = transforms

original_model_output_key = DatasetColumns.MODEL_OUTPUT.value.name
# Compute BERTScores with target_output = the original model output
@@ -286,7 +290,7 @@ def evaluate_sample(
DatasetColumns.PROMPT.value.name: prompt,
DatasetColumns.MODEL_OUTPUT.value.name: model_output,
}
pipeline = self.build_pipeline(model, prompt_template, is_deterministic=is_deterministic)
pipeline = self._build_pipeline(model, prompt_template, is_deterministic=is_deterministic)
output_record = pipeline.execute_record(sample)

bert_score_dissimilarity_value = output_record[BERT_SCORE_DISSIMILARITY]
@@ -329,7 +333,7 @@ def evaluate(
model_invocation_pipeline = create_model_invocation_pipeline(model, dataset_prompt_template)
dataset = model_invocation_pipeline.execute(dataset)
is_deterministic = verify_model_determinism(model, dataset, DatasetColumns.PROMPT.value.name)
pipeline = self.build_pipeline(model, dataset_prompt_template, is_deterministic=is_deterministic)
pipeline = self._build_pipeline(model, dataset_prompt_template, is_deterministic=is_deterministic)
eval_output = compute_and_aggregate_metrics(
pipeline=pipeline,
dataset=dataset,
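The constructor docstring above notes that `use_ray` can be disabled because `evaluate_sample` is computationally cheap. Below is a hedged usage sketch, not taken from this PR: the `GeneralSemanticRobustnessConfig` class name, its `num_perturbations` field, the `use_ray` keyword, and the exact `evaluate_sample` parameters (`model_input`, `model`) are assumptions inferred from this diff, and the model runner is a placeholder you would supply.

```python
from fmeval.eval_algorithms.general_semantic_robustness import (
    GeneralSemanticRobustness,
    GeneralSemanticRobustnessConfig,  # assumed config class name
)

# use_ray=False keeps the BERTScore model in-process, which the docstring above
# describes as sufficient for the computationally cheap evaluate_sample path.
config = GeneralSemanticRobustnessConfig(num_perturbations=3)
algo = GeneralSemanticRobustness(config, use_ray=False)

model_runner = ...  # placeholder: any existing fmeval ModelRunner implementation

scores = algo.evaluate_sample(
    model_input="Summarize: The quick brown fox jumps over the lazy dog.",
    model=model_runner,
)
for score in scores:
    print(score.name, score.value)
```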
5 changes: 4 additions & 1 deletion src/fmeval/eval_algorithms/prompt_stereotyping.py
@@ -8,7 +8,7 @@
MEAN,
)
from fmeval.data_loaders.util import DataConfig, get_dataset
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmInterface, EvalAlgorithmConfig
from fmeval.eval_algorithms import (
EvalAlgorithm,
EvalOutput,
@@ -53,6 +53,9 @@ class PromptStereotyping(EvalAlgorithmInterface):

eval_name = PROMPT_STEREOTYPING

def __init__(self):
super().__init__(EvalAlgorithmConfig())

def evaluate(
self,
model: Optional[ModelRunner] = None,
1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/qa_accuracy.py
@@ -242,6 +242,7 @@ def __init__(self, eval_algorithm_config: QAAccuracyConfig = QAAccuracyConfig())

:param eval_algorithm_config: QA Accuracy eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self._eval_algorithm_config = eval_algorithm_config

def evaluate(
1 change: 1 addition & 0 deletions src/fmeval/eval_algorithms/qa_accuracy_semantic_robustness.py
@@ -139,6 +139,7 @@ def __init__(

:param eval_algorithm_config: QA Accuracy Semantic Robustness eval algorithm config.
"""
super().__init__(eval_algorithm_config)
self.eval_name = QA_ACCURACY_SEMANTIC_ROBUSTNESS
self._eval_algorithm_config = eval_algorithm_config

17 changes: 16 additions & 1 deletion src/fmeval/eval_algorithms/semantic_robustness_utils.py
@@ -2,6 +2,7 @@
from typing import Tuple

from fmeval.constants import BUTTER_FINGER, RANDOM_UPPER_CASE, WHITESPACE_ADD_REMOVE, DatasetColumns
from fmeval.eval_algorithms.eval_algorithm import EvalAlgorithmConfig
from fmeval.model_runners.model_runner import ModelRunner
from fmeval.transforms.common import GeneratePrompt, GetModelOutputs
from fmeval.transforms.semantic_perturbations import (
@@ -21,7 +22,7 @@


@dataclass(frozen=True)
class SemanticRobustnessConfig:
class SemanticRobustnessConfig(EvalAlgorithmConfig):
"""Configures the semantic robustness evaluation algorithms.

:param perturbation_type: Perturbation type for generating perturbed inputs.
@@ -53,6 +54,12 @@ def __post_init__(self):


def get_perturbation_transform(config: SemanticRobustnessConfig) -> SemanticPerturbation:
"""Returns a semantic perturbation transform based on parameters in `config`.

:param config: A config that specifies a perturbation type, which dictates the
SemanticPerturbation that gets returned, and its configurable parameters.
:returns: A SemanticPerturbation instance, initialized with parameters passed via `config`.
"""
if config.perturbation_type == BUTTER_FINGER:
return ButterFinger(
input_key=DatasetColumns.MODEL_INPUT.value.name,
@@ -91,6 +98,14 @@ def get_model_responses_from_perturbed_inputs(
prompt_template: str,
model: ModelRunner,
) -> Tuple[SemanticPerturbation, GeneratePrompt, GetModelOutputs]:
"""Returns a tuple of transforms for perturbing model inputs, composing prompts, and getting model outputs.

:param perturbation: The semantic perturbation transform used to perturb inputs.
:param prompt_template: The template used for composing prompts out of the perturbed inputs.
:param model: The model that is invoked on the prompts constructed from perturbed inputs.
:returns: A tuple of three transforms, where the first is the same SemanticPerturbation
that was passed in, and the latter two are created in this function.
"""
# Generate prompts from perturbed inputs
gen_perturbed_prompts = GeneratePrompt(
input_keys=perturbation.output_keys,
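Since the new docstrings above describe how these helpers are meant to be used, here is a brief, hedged usage sketch that is not part of this PR. It assumes `SemanticRobustnessConfig` can be constructed with just `perturbation_type`, and that the returned `SemanticPerturbation`, like other fmeval transforms, can be applied directly to a record dict keyed by dataset column names.

```python
from fmeval.constants import BUTTER_FINGER, DatasetColumns
from fmeval.eval_algorithms.semantic_robustness_utils import (
    SemanticRobustnessConfig,
    get_perturbation_transform,
)

# Pick a perturbation type; get_perturbation_transform dispatches on it.
config = SemanticRobustnessConfig(perturbation_type=BUTTER_FINGER)
perturbation = get_perturbation_transform(config)

# Assumption: a transform is callable on a single record (a dict keyed by column
# names) and writes its results under perturbation.output_keys.
record = {DatasetColumns.MODEL_INPUT.value.name: "London is the capital of England."}
perturbed_record = perturbation(record)

for key in perturbation.output_keys:
    print(perturbed_record[key])
```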