29 changes: 28 additions & 1 deletion src/fmeval/reporting/eval_output_cells.py
@@ -2,7 +2,14 @@
 import ray.data
 from textwrap import shorten
 import numpy as np
-from fmeval.eval_algorithms import EvalOutput, DATASET_CONFIGS, EvalAlgorithm, TREX, CROWS_PAIRS
+from fmeval.eval_algorithms import (
+    EvalOutput,
+    DATASET_CONFIGS,
+    EvalAlgorithm,
+    TREX,
+    CROWS_PAIRS,
+    get_default_prompt_template,
+)
 from fmeval.eval_algorithms.classification_accuracy import CLASSIFICATION_ACCURACY_SCORE
 from fmeval.eval_algorithms.general_semantic_robustness import WER_SCORE
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
@@ -357,6 +364,9 @@ def __init__(
             dataset=dataset,
             eval_name=eval_output.eval_name,
         )
+        prompt_template = EvalOutputCell.format_prompt_template(
+            dataset_type, eval_output.dataset_name, eval_output.prompt_template
+        )
         toxicity_detector_name = (
             f"**Toxicity detector model**: {add_hyperlink(DETOXIFY_NAME, DETOXIFY_URI)}"
             if eval_output.eval_name in TOXICITY_EVAL_NAMES and len(eval_output.dataset_scores) > 1
@@ -368,6 +378,7 @@
         eval_cells = [
             HeadingCell(f"{dataset_type}: {format_dataset_name(eval_output.dataset_name, hyperlink=True)}", level=4),
             MarkdownCell(dataset_description),
+            MarkdownCell(prompt_template),
             MarkdownCell(toxicity_detector_name),
         ]
         if eval_output.error:
@@ -450,3 +461,19 @@ def get_dataset_description(
             else DATASET_DETAILS[dataset_name].description + " " + dataset_sampling_description
         )
         return dataset_description
+
+    @staticmethod
+    def format_prompt_template(dataset_type: str, dataset_name: str, prompt_template: Optional[str] = None) -> str:
+        """
+        :param dataset_type: string indicating if dataset is a built-in or custom dataset.
+        :param dataset_name: the name of the dataset.
+        :param prompt_template: optional prompt template used in the evaluation.
+        :return: prompt template string formatted for the report.
+        """
+        prompt_template_str = "**Prompt Template:** "
+        if prompt_template:
+            return prompt_template_str + escape(prompt_template)
+        elif dataset_type == BUILT_IN_DATASET:
+            return prompt_template_str + get_default_prompt_template(dataset_name)
+        else:
+            return prompt_template_str + "No prompt template was provided for this dataset."
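
Note (not part of the diff): a minimal usage sketch of the new helper. The import path follows the file shown above, but the "Custom Dataset" label is an assumption inferred from the updated tests below, not a constant taken from fmeval.

# Hypothetical illustration of format_prompt_template; the "Custom Dataset"
# label is assumed here rather than imported from fmeval's constants.
from fmeval.reporting.eval_output_cells import EvalOutputCell

# A user-supplied template is reported back (the real code escapes it first).
line = EvalOutputCell.format_prompt_template(
    dataset_type="Custom Dataset",
    dataset_name="Dataset 1",
    prompt_template="Summarize the following: $feature",
)
print(line)  # **Prompt Template:** Summarize the following: $feature

# Without a template on a custom dataset, a fixed fallback message is reported.
print(EvalOutputCell.format_prompt_template("Custom Dataset", "Dataset 1"))
# **Prompt Template:** No prompt template was provided for this dataset.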
12 changes: 5 additions & 7 deletions test/unit/reporting/test_eval_output_cells.py
@@ -1,6 +1,6 @@
 from unittest.mock import patch, Mock, call, MagicMock

-from fmeval.eval_algorithms import EvalOutput, EvalScore, CategoryScore, EvalAlgorithm
+from fmeval.eval_algorithms import EvalOutput, EvalScore, CategoryScore, EvalAlgorithm, DEFAULT_PROMPT_TEMPLATE
 from fmeval.eval_algorithms.prompt_stereotyping import PROMPT_STEREOTYPING
 from fmeval.reporting.cells import BarPlotCell, TableCell
 from fmeval.reporting.eval_output_cells import (
@@ -411,7 +411,7 @@ def test_eval_output_cell_custom_dataset(self):
         eval_output = EvalOutput(
             eval_name="summarization_accuracy",
             dataset_name="Dataset 1",
-            prompt_template="prompt",
+            prompt_template="Summarize the following: $feature",
             dataset_scores=dataset_scores,
             category_scores=category_scores,
         )
@@ -422,7 +422,7 @@ def test_eval_output_cell_custom_dataset(self):
         dataset.select_columns = Mock()
         with patch("fmeval.reporting.eval_output_cells.ScoreCell", return_value="score_cell"):
             cell = EvalOutputCell(eval_output=eval_output, dataset=dataset)
-        expected_cell = "#### Custom Dataset: Dataset 1 \n\nWe sampled 10 records out of 10 in the full dataset. \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell"
+        expected_cell = "#### Custom Dataset: Dataset 1 \n\nWe sampled 10 records out of 10 in the full dataset. \n\n**Prompt Template:** Summarize the following: $feature \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell"
         assert str(cell) == expected_cell

     def test_eval_output_cell_built_in_dataset(self):
@@ -457,7 +457,6 @@ def test_eval_output_cell_built_in_dataset(self):
         eval_output = EvalOutput(
             eval_name="summarization_accuracy",
             dataset_name="gov_report",
-            prompt_template="prompt",
             dataset_scores=dataset_scores,
             category_scores=category_scores,
         )
@@ -467,7 +466,7 @@ def test_eval_output_cell_built_in_dataset(self):
         dataset.select_columns = Mock()
         with patch("fmeval.reporting.eval_output_cells.ScoreCell", return_value="score_cell"):
             cell = EvalOutputCell(eval_output=eval_output, dataset=dataset)
-        expected_cell = '#### Built-in Dataset: <a style="color:#006DAA;" href="https://gov-report-data.github.io/">Government Report</a> \n\nA dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets. We sampled 10 records out of 7238 in the full dataset. \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell'
+        expected_cell = f'#### Built-in Dataset: <a style="color:#006DAA;" href="https://gov-report-data.github.io/">Government Report</a> \n\nA dataset including a long-form summarization benchmark. It contains significantly longer documents (9.4k words) and summaries (553 words) than most existing datasets. We sampled 10 records out of 7238 in the full dataset. \n\n**Prompt Template:** Summarize the following text in a few sentences: {DEFAULT_PROMPT_TEMPLATE} \n\n \n\nscore_cell \n\nscore_cell \n\nscore_cell'
         assert str(cell) == expected_cell

     def test_eval_output_cell_eval_error(self):
@@ -479,10 +478,9 @@ def test_eval_output_cell_eval_error(self):
         eval_output = EvalOutput(
             eval_name="summarization accuracy",
             dataset_name="Dataset 1",
-            prompt_template="prompt",
             error="The summarization accuracy evaluation failed.",
         )
         with patch("fmeval.reporting.eval_output_cells.ScoreCell", return_value="score_cell"):
             cell = EvalOutputCell(eval_output=eval_output)
-        expected_cell = "#### Custom Dataset: Dataset 1 \n\n \n\n \n\n**This evaluation failed with the error message: The summarization accuracy evaluation failed.**"
+        expected_cell = "#### Custom Dataset: Dataset 1 \n\n \n\n**Prompt Template:** No prompt template was provided for this dataset. \n\n \n\n**This evaluation failed with the error message: The summarization accuracy evaluation failed.**"
         assert str(cell) == expected_cell
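
Note (not part of the diff): the built-in fallback asserted in test_eval_output_cell_built_in_dataset can also be exercised directly. A sketch, assuming "Built-in Dataset" matches the BUILT_IN_DATASET label used by the report cells; the exact default template is whatever get_default_prompt_template returns for the dataset.

# Hypothetical illustration of the built-in dataset path; "Built-in Dataset"
# is an assumed label, not imported from fmeval's constants.
from fmeval.eval_algorithms import get_default_prompt_template
from fmeval.reporting.eval_output_cells import EvalOutputCell

line = EvalOutputCell.format_prompt_template("Built-in Dataset", "gov_report")
# With no template supplied, the report falls back to the registered default:
print(line)  # expected: "**Prompt Template:** " + get_default_prompt_template("gov_report")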