From 30a4ae62592f54204a1697c700b9ca5902b37c23 Mon Sep 17 00:00:00 2001
From: Michele Donini <mikko108382892@gmail.com>
Date: Mon, 11 Mar 2024 16:57:35 +0100
Subject: [PATCH 1/9] fix: update the default prompt templates for the built-in
 datasets

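Change the default prompt templates for the built-in datasets so that they align better with our prompt templating structure: $feature is now the last token of every template, the trailing "Answer:" / "Classification:" cues are removed, and "Summarise" is spelled "Summarize".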
---
 src/fmeval/eval_algorithms/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/fmeval/eval_algorithms/__init__.py b/src/fmeval/eval_algorithms/__init__.py
index edca221e..71666a07 100644
--- a/src/fmeval/eval_algorithms/__init__.py
+++ b/src/fmeval/eval_algorithms/__init__.py
@@ -215,14 +215,14 @@ class ModelTask(Enum):
 DEFAULT_PROMPT_TEMPLATE = "$feature"
 
 BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES = {
-    BOOLQ: 'Respond to the following question. Valid answers are "True" or "False". $feature Answer:',
-    TRIVIA_QA: "Respond to the following question with a short answer: $feature Answer:",
-    NATURAL_QUESTIONS: "Respond to the following question with a short answer: $feature Answer:",
-    GIGAWORD: "Summarise the following text in one sentence: $feature",
-    GOV_REPORT: "Summarise the following text in a few sentences: $feature",
+    BOOLQ: 'Respond to the following question. Valid answers are "True" or "False". $feature',
+    TRIVIA_QA: "Respond to the following question with a short answer: $feature",
+    NATURAL_QUESTIONS: "Respond to the following question with a short answer: $feature",
+    GIGAWORD: "Summarize the following text in one sentence: $feature",
+    GOV_REPORT: "Summarize the following text in a few sentences: $feature",
     WOMENS_CLOTHING_ECOMMERCE_REVIEWS: (
         "Classify the sentiment of the following review with 0 (negative sentiment)"
-        " or 1 (positive sentiment). Review: $feature. Classification:"
+        " or 1 (positive sentiment): $feature"
     ),
 }
 

From 22662da6ed81454798ea9dfbd1ce173262ec9a0b Mon Sep 17 00:00:00 2001
From: Michele Donini <mikko108382892@gmail.com>
Date: Mon, 11 Mar 2024 17:17:14 +0100
Subject: [PATCH 2/9] Update unit test

Update the unit test for the default prompt template
---
 test/unit/eval_algorithms/test_eval_algorithm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/eval_algorithms/test_eval_algorithm.py b/test/unit/eval_algorithms/test_eval_algorithm.py
index 49342772..2ff54aee 100644
--- a/test/unit/eval_algorithms/test_eval_algorithm.py
+++ b/test/unit/eval_algorithms/test_eval_algorithm.py
@@ -259,6 +259,6 @@ def test_get_default_prompt_template():
     """
     assert (
         get_default_prompt_template("trivia_qa")
-        == "Respond to the following question with a short answer: $feature Answer:"
+        == "Respond to the following question with a short answer: $feature"
     )
     assert get_default_prompt_template("my_custom_dataset") == "$feature"

From 162e69ff5632b31b06d15a524b92311a607944f2 Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Mon, 11 Mar 2024 17:57:54 +0100
Subject: [PATCH 3/9] Update unit tests

---
 test/unit/eval_algorithms/test_eval_algorithm.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/test/unit/eval_algorithms/test_eval_algorithm.py b/test/unit/eval_algorithms/test_eval_algorithm.py
index 2ff54aee..e8b38914 100644
--- a/test/unit/eval_algorithms/test_eval_algorithm.py
+++ b/test/unit/eval_algorithms/test_eval_algorithm.py
@@ -257,8 +257,5 @@ def test_get_default_prompt_template():
     WHEN get_default_prompt_template() method is called
     THEN expected default prompt template is returned
     """
-    assert (
-        get_default_prompt_template("trivia_qa")
-        == "Respond to the following question with a short answer: $feature"
-    )
+    assert get_default_prompt_template("trivia_qa") == "Respond to the following question with a short answer: $feature"
     assert get_default_prompt_template("my_custom_dataset") == "$feature"

From 53a6609a215d44e9cbcbb8e0eba9b46f997bcea5 Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Wed, 13 Mar 2024 10:29:25 +0100
Subject: [PATCH 4/9] Update integration tests

---
 .../test_summarization_accuracy_semantic_robustness.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py
index e2fda8ea..32e774d5 100644
--- a/test/integration/test_summarization_accuracy_semantic_robustness.py
+++ b/test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -66,7 +66,7 @@ class TestSummarizationAccuracySemanticRobustness:
                 expected_evaluate_scores={
                     ROUGE_SCORE: 0.021908,
                     METEOR_SCORE: 0.107623,
-                    BERT_SCORE: 0.559997,
+                    BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.019394,
                     DELTA_METEOR_SCORE: 0.044310,
                     DELTA_BERT_SCORE: 0.033714,
@@ -85,7 +85,7 @@ class TestSummarizationAccuracySemanticRobustness:
                 expected_evaluate_scores={
                     ROUGE_SCORE: 0.021908,
                     METEOR_SCORE: 0.107623,
-                    BERT_SCORE: 0.559998,
+                    BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.035696,
                     DELTA_METEOR_SCORE: 0.056931,
                     DELTA_BERT_SCORE: 0.027971,
@@ -104,7 +104,7 @@ class TestSummarizationAccuracySemanticRobustness:
                 expected_evaluate_scores={
                     ROUGE_SCORE: 0.021908,
                     METEOR_SCORE: 0.107623,
-                    BERT_SCORE: 0.559998,
+                    BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.032187,
                     DELTA_METEOR_SCORE: 0.057705,
                     DELTA_BERT_SCORE: 0.027511,

From 912f863092df1e596cfc746b79e2a472e9de7ae7 Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Wed, 13 Mar 2024 11:39:21 +0100
Subject: [PATCH 5/9] Update integration tests (meteor)

---
 .../test_summarization_accuracy_semantic_robustness.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py
index 32e774d5..7669b2a9 100644
--- a/test/integration/test_summarization_accuracy_semantic_robustness.py
+++ b/test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -65,7 +65,7 @@ class TestSummarizationAccuracySemanticRobustness:
                 },
                 expected_evaluate_scores={
                     ROUGE_SCORE: 0.021908,
-                    METEOR_SCORE: 0.107623,
+                    METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.019394,
                     DELTA_METEOR_SCORE: 0.044310,
@@ -84,7 +84,7 @@ class TestSummarizationAccuracySemanticRobustness:
                 },
                 expected_evaluate_scores={
                     ROUGE_SCORE: 0.021908,
-                    METEOR_SCORE: 0.107623,
+                    METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.035696,
                     DELTA_METEOR_SCORE: 0.056931,
@@ -103,7 +103,7 @@ class TestSummarizationAccuracySemanticRobustness:
                 },
                 expected_evaluate_scores={
                     ROUGE_SCORE: 0.021908,
-                    METEOR_SCORE: 0.107623,
+                    METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.032187,
                     DELTA_METEOR_SCORE: 0.057705,

From d994a50bd7f8dcea95aa4bc0faa6e1204cbe5edc Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Wed, 13 Mar 2024 12:51:58 +0100
Subject: [PATCH 6/9] Update integration tests (delta rouge)

---
 .../test_summarization_accuracy_semantic_robustness.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py
index 7669b2a9..efb481f6 100644
--- a/test/integration/test_summarization_accuracy_semantic_robustness.py
+++ b/test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -67,7 +67,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     ROUGE_SCORE: 0.021908,
                     METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
-                    DELTA_ROUGE_SCORE: 0.019394,
+                    DELTA_ROUGE_SCORE: 0.021061,
                     DELTA_METEOR_SCORE: 0.044310,
                     DELTA_BERT_SCORE: 0.033714,
                 },
@@ -86,7 +86,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     ROUGE_SCORE: 0.021908,
                     METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
-                    DELTA_ROUGE_SCORE: 0.035696,
+                    DELTA_ROUGE_SCORE: 0.037362,
                     DELTA_METEOR_SCORE: 0.056931,
                     DELTA_BERT_SCORE: 0.027971,
                 },
@@ -105,7 +105,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     ROUGE_SCORE: 0.021908,
                     METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
-                    DELTA_ROUGE_SCORE: 0.032187,
+                    DELTA_ROUGE_SCORE: 0.030725,
                     DELTA_METEOR_SCORE: 0.057705,
                     DELTA_BERT_SCORE: 0.027511,
                 },

From 3c92f26d8ce09497d3c254b1ef4a444d7bd693bf Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Wed, 13 Mar 2024 13:43:29 +0100
Subject: [PATCH 7/9] Update integration tests (delta bert)

---
 .../test_summarization_accuracy_semantic_robustness.py      | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py
index efb481f6..0ae3277d 100644
--- a/test/integration/test_summarization_accuracy_semantic_robustness.py
+++ b/test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -69,7 +69,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.021061,
                     DELTA_METEOR_SCORE: 0.044310,
-                    DELTA_BERT_SCORE: 0.033714,
+                    DELTA_BERT_SCORE: 0.032417,
                 },
             ),
             TestCaseEvaluate(
@@ -88,7 +88,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.037362,
                     DELTA_METEOR_SCORE: 0.056931,
-                    DELTA_BERT_SCORE: 0.027971,
+                    DELTA_BERT_SCORE: 0.026363,
                 },
             ),
             TestCaseEvaluate(
@@ -107,7 +107,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.030725,
                     DELTA_METEOR_SCORE: 0.057705,
-                    DELTA_BERT_SCORE: 0.027511,
+                    DELTA_BERT_SCORE: 0.026511,
                 },
             ),
         ],

From e5dbeaf5e5511eb61a580dc14e975a00c01fa520 Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Wed, 13 Mar 2024 14:38:52 +0100
Subject: [PATCH 8/9] Update integration tests (delta meteor)

---
 .../test_summarization_accuracy_semantic_robustness.py        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py
index 0ae3277d..46f09f7c 100644
--- a/test/integration/test_summarization_accuracy_semantic_robustness.py
+++ b/test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -68,7 +68,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.021061,
-                    DELTA_METEOR_SCORE: 0.044310,
+                    DELTA_METEOR_SCORE: 0.046859,
                     DELTA_BERT_SCORE: 0.032417,
                 },
             ),
@@ -87,7 +87,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.037362,
-                    DELTA_METEOR_SCORE: 0.056931,
+                    DELTA_METEOR_SCORE: 0.056909,
                     DELTA_BERT_SCORE: 0.026363,
                 },
             ),

From 7dcf1568ea3186f527dc32668f0a3d9ba9c8a5ac Mon Sep 17 00:00:00 2001
From: jmikko <mikko108382892@gmail.com>
Date: Wed, 13 Mar 2024 15:31:57 +0100
Subject: [PATCH 9/9] Update integration tests (delta meteor)

---
 .../test_summarization_accuracy_semantic_robustness.py          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py
index 46f09f7c..11c4a8fe 100644
--- a/test/integration/test_summarization_accuracy_semantic_robustness.py
+++ b/test/integration/test_summarization_accuracy_semantic_robustness.py
@@ -106,7 +106,7 @@ class TestSummarizationAccuracySemanticRobustness:
                     METEOR_SCORE: 0.105540,
                     BERT_SCORE: 0.559893,
                     DELTA_ROUGE_SCORE: 0.030725,
-                    DELTA_METEOR_SCORE: 0.057705,
+                    DELTA_METEOR_SCORE: 0.054234,
                     DELTA_BERT_SCORE: 0.026511,
                 },
             ),