From 30a4ae62592f54204a1697c700b9ca5902b37c23 Mon Sep 17 00:00:00 2001 From: Michele Donini Date: Mon, 11 Mar 2024 16:57:35 +0100 Subject: [PATCH 1/9] fix: update the default prompt templates for the built-in datasets Changing the default prompt templates for the built-in datasets in order to align better with our prompt templating structure by keeping the $feature as the last token. --- src/fmeval/eval_algorithms/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/fmeval/eval_algorithms/__init__.py b/src/fmeval/eval_algorithms/__init__.py index edca221e..71666a07 100644 --- a/src/fmeval/eval_algorithms/__init__.py +++ b/src/fmeval/eval_algorithms/__init__.py @@ -215,14 +215,14 @@ class ModelTask(Enum): DEFAULT_PROMPT_TEMPLATE = "$feature" BUILT_IN_DATASET_DEFAULT_PROMPT_TEMPLATES = { - BOOLQ: 'Respond to the following question. Valid answers are "True" or "False". $feature Answer:', - TRIVIA_QA: "Respond to the following question with a short answer: $feature Answer:", - NATURAL_QUESTIONS: "Respond to the following question with a short answer: $feature Answer:", - GIGAWORD: "Summarise the following text in one sentence: $feature", - GOV_REPORT: "Summarise the following text in a few sentences: $feature", + BOOLQ: 'Respond to the following question. Valid answers are "True" or "False". $feature', + TRIVIA_QA: "Respond to the following question with a short answer: $feature", + NATURAL_QUESTIONS: "Respond to the following question with a short answer: $feature", + GIGAWORD: "Summarize the following text in one sentence: $feature", + GOV_REPORT: "Summarize the following text in a few sentences: $feature", WOMENS_CLOTHING_ECOMMERCE_REVIEWS: ( "Classify the sentiment of the following review with 0 (negative sentiment)" - " or 1 (positive sentiment). Review: $feature. Classification:" + " or 1 (positive sentiment): $feature" ), } From 22662da6ed81454798ea9dfbd1ce173262ec9a0b Mon Sep 17 00:00:00 2001 From: Michele Donini Date: Mon, 11 Mar 2024 17:17:14 +0100 Subject: [PATCH 2/9] Update unit test Update the unit test for the default prompt template --- test/unit/eval_algorithms/test_eval_algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/eval_algorithms/test_eval_algorithm.py b/test/unit/eval_algorithms/test_eval_algorithm.py index 49342772..2ff54aee 100644 --- a/test/unit/eval_algorithms/test_eval_algorithm.py +++ b/test/unit/eval_algorithms/test_eval_algorithm.py @@ -259,6 +259,6 @@ def test_get_default_prompt_template(): """ assert ( get_default_prompt_template("trivia_qa") - == "Respond to the following question with a short answer: $feature Answer:" + == "Respond to the following question with a short answer: $feature" ) assert get_default_prompt_template("my_custom_dataset") == "$feature" From 162e69ff5632b31b06d15a524b92311a607944f2 Mon Sep 17 00:00:00 2001 From: jmikko Date: Mon, 11 Mar 2024 17:57:54 +0100 Subject: [PATCH 3/9] Update unit tests --- test/unit/eval_algorithms/test_eval_algorithm.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/unit/eval_algorithms/test_eval_algorithm.py b/test/unit/eval_algorithms/test_eval_algorithm.py index 2ff54aee..e8b38914 100644 --- a/test/unit/eval_algorithms/test_eval_algorithm.py +++ b/test/unit/eval_algorithms/test_eval_algorithm.py @@ -257,8 +257,5 @@ def test_get_default_prompt_template(): WHEN get_default_prompt_template() method is called THEN expected default prompt template is returned """ - assert ( - get_default_prompt_template("trivia_qa") - == "Respond to the following question with a short answer: $feature" - ) + assert get_default_prompt_template("trivia_qa") == "Respond to the following question with a short answer: $feature" assert get_default_prompt_template("my_custom_dataset") == "$feature" From 53a6609a215d44e9cbcbb8e0eba9b46f997bcea5 Mon Sep 17 00:00:00 2001 From: jmikko Date: Wed, 13 Mar 2024 10:29:25 +0100 Subject: [PATCH 4/9] Update integration tests --- .../test_summarization_accuracy_semantic_robustness.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py index e2fda8ea..32e774d5 100644 --- a/test/integration/test_summarization_accuracy_semantic_robustness.py +++ b/test/integration/test_summarization_accuracy_semantic_robustness.py @@ -66,7 +66,7 @@ class TestSummarizationAccuracySemanticRobustness: expected_evaluate_scores={ ROUGE_SCORE: 0.021908, METEOR_SCORE: 0.107623, - BERT_SCORE: 0.559997, + BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.019394, DELTA_METEOR_SCORE: 0.044310, DELTA_BERT_SCORE: 0.033714, @@ -85,7 +85,7 @@ class TestSummarizationAccuracySemanticRobustness: expected_evaluate_scores={ ROUGE_SCORE: 0.021908, METEOR_SCORE: 0.107623, - BERT_SCORE: 0.559998, + BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.035696, DELTA_METEOR_SCORE: 0.056931, DELTA_BERT_SCORE: 0.027971, @@ -104,7 +104,7 @@ class TestSummarizationAccuracySemanticRobustness: expected_evaluate_scores={ ROUGE_SCORE: 0.021908, METEOR_SCORE: 0.107623, - BERT_SCORE: 0.559998, + BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.032187, DELTA_METEOR_SCORE: 0.057705, DELTA_BERT_SCORE: 0.027511, From 912f863092df1e596cfc746b79e2a472e9de7ae7 Mon Sep 17 00:00:00 2001 From: jmikko Date: Wed, 13 Mar 2024 11:39:21 +0100 Subject: [PATCH 5/9] Update integration tests (meteor) --- .../test_summarization_accuracy_semantic_robustness.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py index 32e774d5..7669b2a9 100644 --- a/test/integration/test_summarization_accuracy_semantic_robustness.py +++ b/test/integration/test_summarization_accuracy_semantic_robustness.py @@ -65,7 +65,7 @@ class TestSummarizationAccuracySemanticRobustness: }, expected_evaluate_scores={ ROUGE_SCORE: 0.021908, - METEOR_SCORE: 0.107623, + METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.019394, DELTA_METEOR_SCORE: 0.044310, @@ -84,7 +84,7 @@ class TestSummarizationAccuracySemanticRobustness: }, expected_evaluate_scores={ ROUGE_SCORE: 0.021908, - METEOR_SCORE: 0.107623, + METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.035696, DELTA_METEOR_SCORE: 0.056931, @@ -103,7 +103,7 @@ class TestSummarizationAccuracySemanticRobustness: }, expected_evaluate_scores={ ROUGE_SCORE: 0.021908, - METEOR_SCORE: 0.107623, + METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.032187, DELTA_METEOR_SCORE: 0.057705, From d994a50bd7f8dcea95aa4bc0faa6e1204cbe5edc Mon Sep 17 00:00:00 2001 From: jmikko Date: Wed, 13 Mar 2024 12:51:58 +0100 Subject: [PATCH 6/9] Update integration tests (delta rouge) --- .../test_summarization_accuracy_semantic_robustness.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py index 7669b2a9..efb481f6 100644 --- a/test/integration/test_summarization_accuracy_semantic_robustness.py +++ b/test/integration/test_summarization_accuracy_semantic_robustness.py @@ -67,7 +67,7 @@ class TestSummarizationAccuracySemanticRobustness: ROUGE_SCORE: 0.021908, METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, - DELTA_ROUGE_SCORE: 0.019394, + DELTA_ROUGE_SCORE: 0.021061, DELTA_METEOR_SCORE: 0.044310, DELTA_BERT_SCORE: 0.033714, }, @@ -86,7 +86,7 @@ class TestSummarizationAccuracySemanticRobustness: ROUGE_SCORE: 0.021908, METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, - DELTA_ROUGE_SCORE: 0.035696, + DELTA_ROUGE_SCORE: 0.037362, DELTA_METEOR_SCORE: 0.056931, DELTA_BERT_SCORE: 0.027971, }, @@ -105,7 +105,7 @@ class TestSummarizationAccuracySemanticRobustness: ROUGE_SCORE: 0.021908, METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, - DELTA_ROUGE_SCORE: 0.032187, + DELTA_ROUGE_SCORE: 0.030725, DELTA_METEOR_SCORE: 0.057705, DELTA_BERT_SCORE: 0.027511, }, From 3c92f26d8ce09497d3c254b1ef4a444d7bd693bf Mon Sep 17 00:00:00 2001 From: jmikko Date: Wed, 13 Mar 2024 13:43:29 +0100 Subject: [PATCH 7/9] Update integration tests (delta bert) --- .../test_summarization_accuracy_semantic_robustness.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py index efb481f6..0ae3277d 100644 --- a/test/integration/test_summarization_accuracy_semantic_robustness.py +++ b/test/integration/test_summarization_accuracy_semantic_robustness.py @@ -69,7 +69,7 @@ class TestSummarizationAccuracySemanticRobustness: BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.021061, DELTA_METEOR_SCORE: 0.044310, - DELTA_BERT_SCORE: 0.033714, + DELTA_BERT_SCORE: 0.032417, }, ), TestCaseEvaluate( @@ -88,7 +88,7 @@ class TestSummarizationAccuracySemanticRobustness: BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.037362, DELTA_METEOR_SCORE: 0.056931, - DELTA_BERT_SCORE: 0.027971, + DELTA_BERT_SCORE: 0.026363, }, ), TestCaseEvaluate( @@ -107,7 +107,7 @@ class TestSummarizationAccuracySemanticRobustness: BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.030725, DELTA_METEOR_SCORE: 0.057705, - DELTA_BERT_SCORE: 0.027511, + DELTA_BERT_SCORE: 0.026511, }, ), ], From e5dbeaf5e5511eb61a580dc14e975a00c01fa520 Mon Sep 17 00:00:00 2001 From: jmikko Date: Wed, 13 Mar 2024 14:38:52 +0100 Subject: [PATCH 8/9] Update integration tests (delta meteor) --- .../test_summarization_accuracy_semantic_robustness.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py index 0ae3277d..46f09f7c 100644 --- a/test/integration/test_summarization_accuracy_semantic_robustness.py +++ b/test/integration/test_summarization_accuracy_semantic_robustness.py @@ -68,7 +68,7 @@ class TestSummarizationAccuracySemanticRobustness: METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.021061, - DELTA_METEOR_SCORE: 0.044310, + DELTA_METEOR_SCORE: 0.046859, DELTA_BERT_SCORE: 0.032417, }, ), @@ -87,7 +87,7 @@ class TestSummarizationAccuracySemanticRobustness: METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.037362, - DELTA_METEOR_SCORE: 0.056931, + DELTA_METEOR_SCORE: 0.056909, DELTA_BERT_SCORE: 0.026363, }, ), From 7dcf1568ea3186f527dc32668f0a3d9ba9c8a5ac Mon Sep 17 00:00:00 2001 From: jmikko Date: Wed, 13 Mar 2024 15:31:57 +0100 Subject: [PATCH 9/9] Update integration tests (delta meteor) --- .../test_summarization_accuracy_semantic_robustness.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/test_summarization_accuracy_semantic_robustness.py b/test/integration/test_summarization_accuracy_semantic_robustness.py index 46f09f7c..11c4a8fe 100644 --- a/test/integration/test_summarization_accuracy_semantic_robustness.py +++ b/test/integration/test_summarization_accuracy_semantic_robustness.py @@ -106,7 +106,7 @@ class TestSummarizationAccuracySemanticRobustness: METEOR_SCORE: 0.105540, BERT_SCORE: 0.559893, DELTA_ROUGE_SCORE: 0.030725, - DELTA_METEOR_SCORE: 0.057705, + DELTA_METEOR_SCORE: 0.054234, DELTA_BERT_SCORE: 0.026511, }, ),