29 changes: 28 additions & 1 deletion src/fmeval/reporting/eval_output_cells.py
@@ -2,7 +2,14 @@
 import ray.data
 from textwrap import shorten
 import numpy as np
-from fmeval.eval_algorithms import EvalOutput, DATASET_CONFIGS, EvalAlgorithm, TREX, CROWS_PAIRS
+from fmeval.eval_algorithms import (
+    EvalOutput,
+    DATASET_CONFIGS,
+    EvalAlgorithm,
+    TREX,
+    CROWS_PAIRS,
+    get_default_prompt_template,
+)
 from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE
 from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
@@ -357,6 +364,9 @@ def __init__(
             dataset=dataset,
             eval_name=eval_output.eval_name,
         )
+        prompt_template = EvalOutputCell.format_prompt_template(
+            dataset_type, eval_output.dataset_name, eval_output.prompt_template
+        )
         toxicity_detector_name = (
             f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
             if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) > 1
@@ -368,6 +378,7 @@
         eval_cells = [
             HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
             MarkdownCell(dataset_description),
+            MarkdownCell(prompt_template),
             MarkdownCell(toxicity_detector_name),
         ]
         if eval_output.error:
@@ -450,3 +461,19 @@ def get_dataset_description(
             else DATASET_DETAILS[dataset_name].description + " " + dataset_sampling_description
         )
         return dataset_description
+
+    @staticmethod
+    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
+        """
+        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
+        :param dataset_name: the name of the dataset.
+        :param prompt_template: optional prompt template used in the evaluation.
+        :return: prompt template string formatted for the report.
+        """
+        prompt_template_str = "**Prompt Template:** "
+        if prompt_template:
+            return prompt_template_str + escape(prompt_template)
+        elif dataset_type == BUILT_IN_DATASET:
+            return prompt_template_str + get_default_prompt_template(dataset_name)
+        else:
+            return prompt_template_str + "No prompt template was provided for this dataset."
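
Note (not part of the diff): a minimal usage sketch of the new helper. The import path follows the file shown above, but the "Custom Dataset" label is an assumption inferred from the updated tests below, not a constant taken from fmeval.

# Hypothetical illustration of format_prompt_template; the "Custom Dataset"
# label is assumed here rather than imported from fmeval's constants.
from fmeval.reporting.eval_output_cells import EvalOutputCell

# A user-supplied template is reported back (the real code escapes it first).
line = EvalOutputCell.format_prompt_template(
    dataset_type="Custom Dataset",
    dataset_name="Dataset 1",
    prompt_template="Summarize the following: $feature",
)
print(line)  # **Prompt Template:** Summarize the following: $feature

# Without a template on a custom dataset, a fixed fallback message is reported.
print(EvalOutputCell.format_prompt_template("Custom Dataset", "Dataset 1"))
# **Prompt Template:** No prompt template was provided for this dataset.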
12 changes: 5 additions & 7 deletions test/unit/reporting/test_eval_output_cells.py
@@ -1,6 +1,6 @@
 from unittest.mock import patch, Mock, call, MagicMock

-from fmeval.eval_algorithms import EvalOutput, EvalScore, CategoryScore, EvalAlgorithm
+from fmeval.eval_algorithms import EvalOutput, EvalScore, CategoryScore, EvalAlgorithm, DEFAULT_PROMPT_TEMPLATE
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
 from fmeval.reporting.cells import BarPlotCell, TableCell
 from fmeval.reporting.eval_output_cells import (
@@ -411,7 +411,7 @@ def test_eval_output_cell_custom_dataset(self):
         eval_output = EvalOutput(
             eval_name="summarization_accuracy",
             dataset_name="Dataset 1",
-            prompt_template="prompt",
+            prompt_template="Summarize the following: $feature",
             dataset_scores=dataset_scores,
             category_scores=category_scores,
         )
@@ -422,7 +422,7 @@ def test_eval_output_cell_custom_dataset(self):
         dataset.select_columns = Mock()
         with patch("fmeval.reporting.eval_output_cells.ScoreCell", return_value="score_cell"):
             cell = EvalOutputCell(eval_output=eval_output, dataset=dataset)
-        expected_cell = "#### Custom Dataset: Dataset 1 \n\nWe sampled 10 records out of 10 in the full dataset. \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell"
+        expected_cell = "#### Custom Dataset: Dataset 1 \n\nWe sampled 10 records out of 10 in the full dataset. \n\n**Prompt Template:** Summarize the following: $feature \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell"
         assert str(cell) == expected_cell

     def test_eval_output_cell_built_in_dataset(self):
@@ -457,7 +457,6 @@ def test_eval_output_cell_built_in_dataset(self):
         eval_output = EvalOutput(
             eval_name="summarization_accuracy",
             dataset_name="gov_report",
-            prompt_template="prompt",
             dataset_scores=dataset_scores,
             category_scores=category_scores,
         )
@@ -467,7 +466,7 @@ def test_eval_output_cell_built_in_dataset(self):
         dataset.select_columns = Mock()
         with patch("fmeval.reporting.eval_output_cells.ScoreCell", return_value="score_cell"):
             cell = EvalOutputCell(eval_output=eval_output, dataset=dataset)
-        expected_cell = '#### Built-in Dataset: <a style="color:#006DAA;" href="https://gov-report-data.github.io/">Government Report</a> \n\nA dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets. We sampled 10 records out of 7238 in the full dataset. \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell'
+        expected_cell = f'#### Built-in Dataset: <a style="color:#006DAA;" href="https://gov-report-data.github.io/">Government Report</a> \n\nA dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets. We sampled 10 records out of 7238 in the full dataset. \n\n**Prompt Template:** Summarize the following text in a few sentences: {DEFAULT_PROMPT_TEMPLATE} \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell'
         assert str(cell) == expected_cell

     def test_eval_output_cell_eval_error(self):
@@ -479,10 +478,9 @@ def test_eval_output_cell_eval_error(self):
         eval_output = EvalOutput(
             eval_name="summarization accuracy",
             dataset_name="Dataset 1",
-            prompt_template="prompt",
             error="The summarization accuracy evaluation failed.",
         )
         with patch("fmeval.reporting.eval_output_cells.ScoreCell", return_value="score_cell"):
             cell = EvalOutputCell(eval_output=eval_output)
-        expected_cell = "#### Custom Dataset: Dataset 1 \n\n \n\n \n\n**This evaluation failed with the error message: The summarization accuracy evaluation failed.**"
+        expected_cell = "#### Custom Dataset: Dataset 1 \n\n \n\n**Prompt Template:** No prompt template was provided for this dataset. \n\n \n\n**This evaluation failed with the error message: The summarization accuracy evaluation failed.**"
         assert str(cell) == expected_cell
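
Note (not part of the diff): the built-in fallback asserted in test_eval_output_cell_built_in_dataset can also be exercised directly. A sketch, assuming "Built-in Dataset" matches the BUILT_IN_DATASET label used by the report cells; the exact default template is whatever get_default_prompt_template returns for the dataset.

# Hypothetical illustration of the built-in dataset path; "Built-in Dataset"
# is an assumed label, not imported from fmeval's constants.
from fmeval.eval_algorithms import get_default_prompt_template
from fmeval.reporting.eval_output_cells import EvalOutputCell

line = EvalOutputCell.format_prompt_template("Built-in Dataset", "gov_report")
# With no template supplied, the report falls back to the registered default:
print(line)  # expected: "**Prompt Template:** " + get_default_prompt_template("gov_report")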