From 6f3554487272658bd728687298d96c5f054f683e Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Sun, 25 Sep 2022 13:43:16 -0700 Subject: [PATCH 1/9] Initial commit --- openai/validators.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/openai/validators.py b/openai/validators.py index 0d4d85d4f2..6e6ea140c5 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -128,7 +128,7 @@ def duplicated_rows_validator(df, fields=["prompt", "completion"]): This validator will suggest to the user to remove duplicate rows if they exist. """ duplicated_rows = df.duplicated(subset=fields) - duplicated_indexes = df.reset_index().index[duplicated_rows].tolist() + duplicated_indexes = df.index[duplicated_rows].tolist() immediate_msg = None optional_msg = None optional_fn = None @@ -158,17 +158,26 @@ def long_examples_validator(df): ft_type = infer_task_type(df) if ft_type != "open-ended generation": - long_examples = df.apply( - lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1 - ) - long_indexes = df.reset_index().index[long_examples].tolist() + def find_long_indexes(d): + long_examples = d.apply( + lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1 + ) + long_indexes = d.index[long_examples].tolist() + return long_indexes + + long_indexes = find_long_indexes(df) if len(long_indexes) > 0: immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens." optional_msg = f"Remove {len(long_indexes)} long examples" def optional_fn(x): - return x.drop(long_indexes) + + long_indexes_to_drop = find_long_indexes(x) + if long_indexes != long_indexes_to_drop: + print( + f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}") + return x.drop(long_indexes_to_drop) return Remediation( name="long_examples", From 32d936c9009cd11952bcce94f83f750b776c4c67 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Tue, 27 Sep 2022 13:03:10 -0700 Subject: [PATCH 2/9] Add fix --- openai/validators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openai/validators.py b/openai/validators.py index 6e6ea140c5..5d08da60bf 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -158,14 +158,14 @@ def long_examples_validator(df): ft_type = infer_task_type(df) if ft_type != "open-ended generation": - def find_long_indexes(d): + def get_long_indexes(d): long_examples = d.apply( lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1 ) long_indexes = d.index[long_examples].tolist() return long_indexes - long_indexes = find_long_indexes(df) + long_indexes = get_long_indexes(df) if len(long_indexes) > 0: immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens." @@ -173,7 +173,7 @@ def find_long_indexes(d): def optional_fn(x): - long_indexes_to_drop = find_long_indexes(x) + long_indexes_to_drop = get_long_indexes(x) if long_indexes != long_indexes_to_drop: print( f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}") From 4f1ed5bb8339186822735cc572ec8b0165279db2 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Tue, 27 Sep 2022 13:10:44 -0700 Subject: [PATCH 3/9] Reinstate reset_index() --- openai/validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openai/validators.py b/openai/validators.py index 5d08da60bf..0f2237b803 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -128,7 +128,7 @@ def duplicated_rows_validator(df, fields=["prompt", "completion"]): This validator will suggest to the user to remove duplicate rows if they exist. """ duplicated_rows = df.duplicated(subset=fields) - duplicated_indexes = df.index[duplicated_rows].tolist() + duplicated_indexes = df.reset_index().index[duplicated_rows].tolist() immediate_msg = None optional_msg = None optional_fn = None From 60062e2d12b0aa303362a2b20528cb8e54df1e23 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Fri, 14 Oct 2022 14:09:48 -0700 Subject: [PATCH 4/9] Add suggestions --- openai/validators.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openai/validators.py b/openai/validators.py index 0f2237b803..24be2c4136 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -162,21 +162,21 @@ def get_long_indexes(d): long_examples = d.apply( lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1 ) - long_indexes = d.index[long_examples].tolist() - return long_indexes + return d.reset_index().index[long_examples].tolist() long_indexes = get_long_indexes(df) + print("long_indexes:", long_indexes) + if len(long_indexes) > 0: immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens." - optional_msg = f"Remove {len(long_indexes)} long examples" + optional_msg = f"Remove {len(long_indexes)} long examples." def optional_fn(x): long_indexes_to_drop = get_long_indexes(x) if long_indexes != long_indexes_to_drop: - print( - f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}") + sys.stdout.write(f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n") return x.drop(long_indexes_to_drop) return Remediation( From dfa7e73f0b336da716d47e626f21a247845380f9 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Fri, 14 Oct 2022 15:42:44 -0700 Subject: [PATCH 5/9] Remove print stmt --- openai/validators.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/openai/validators.py b/openai/validators.py index 24be2c4136..0df02d8409 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -166,8 +166,6 @@ def get_long_indexes(d): long_indexes = get_long_indexes(df) - print("long_indexes:", long_indexes) - if len(long_indexes) > 0: immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens." optional_msg = f"Remove {len(long_indexes)} long examples." From dca5f6c18607da2ae48336279fb635c4fec528e8 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Fri, 14 Oct 2022 15:44:10 -0700 Subject: [PATCH 6/9] punctuation --- openai/validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openai/validators.py b/openai/validators.py index 0df02d8409..23ff525495 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -168,7 +168,7 @@ def get_long_indexes(d): if len(long_indexes) > 0: immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens." - optional_msg = f"Remove {len(long_indexes)} long examples." + optional_msg = f"Remove {len(long_indexes)} long examples" def optional_fn(x): From b66095535348c1f39d8324de848a490c1f8709a9 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Sun, 16 Oct 2022 00:03:08 -0700 Subject: [PATCH 7/9] Add test for fine_tunes.prepare_data --- openai/tests/test_prepare_data.py | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 openai/tests/test_prepare_data.py diff --git a/openai/tests/test_prepare_data.py b/openai/tests/test_prepare_data.py new file mode 100644 index 0000000000..8d7a9b5743 --- /dev/null +++ b/openai/tests/test_prepare_data.py @@ -0,0 +1,41 @@ +import json +import subprocess +from tempfile import NamedTemporaryFile + + +def test_prepare_data() -> None: + + # data + short_prompt = "a prompt " + long_prompt = short_prompt * 500 + + short_completion = "a completion " + long_completion = short_completion * 500 + + # the order of these matters + unprepared_training_data = [ + {"prompt": long_prompt, "completion": long_completion}, # 1 of 2 duplicates + {"prompt": short_prompt, "completion": short_completion}, + {"prompt": long_prompt, "completion": long_completion}, # 2 of 2 duplicates + + ] + + with NamedTemporaryFile(suffix="jsonl", mode="w") as training_data: + for prompt_completion_row in unprepared_training_data: + training_data.write(json.dumps(prompt_completion_row) + "\n") + training_data.flush() + + prepared_data_cmd_output = subprocess.run( + [f"openai tools fine_tunes.prepare_data -f {training_data.name}"], + stdout=subprocess.PIPE, + text=True, + input="y\ny\ny\ny\ny", # apply all recommendations, but one at a time + stderr=subprocess.PIPE, + encoding="utf-8", + shell=True + ) + + assert prepared_data_cmd_output.stderr == "" # validate no errors + assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() + + return prepared_data_cmd_output.stdout \ No newline at end of file From 0198c30c839a1a831f1cbd86b30723c1e5c73372 Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Sun, 16 Oct 2022 00:07:54 -0700 Subject: [PATCH 8/9] Renamed file, added docstrings --- ...repare_data.py => test_long_examples_validator.py} | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) rename openai/tests/{test_prepare_data.py => test_long_examples_validator.py} (81%) diff --git a/openai/tests/test_prepare_data.py b/openai/tests/test_long_examples_validator.py similarity index 81% rename from openai/tests/test_prepare_data.py rename to openai/tests/test_long_examples_validator.py index 8d7a9b5743..6e09593cea 100644 --- a/openai/tests/test_prepare_data.py +++ b/openai/tests/test_long_examples_validator.py @@ -3,7 +3,12 @@ from tempfile import NamedTemporaryFile -def test_prepare_data() -> None: +def test_long_examples_validator() -> None: + + """ + Ensures that long_examples_validator() handles previously applied recommendations, + namely dropped duplicates, without resulting in a KeyError. + """ # data short_prompt = "a prompt " @@ -29,13 +34,13 @@ def test_prepare_data() -> None: [f"openai tools fine_tunes.prepare_data -f {training_data.name}"], stdout=subprocess.PIPE, text=True, - input="y\ny\ny\ny\ny", # apply all recommendations, but one at a time + input="y\ny\ny\ny\ny", # apply all recommendations, one at a time stderr=subprocess.PIPE, encoding="utf-8", shell=True ) - assert prepared_data_cmd_output.stderr == "" # validate no errors + assert prepared_data_cmd_output.stderr == "" # validate data was prepared successfully assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() return prepared_data_cmd_output.stdout \ No newline at end of file From 1e675bd0886a410cd67569b190cb42adebaf37cd Mon Sep 17 00:00:00 2001 From: Serina Grill Date: Sun, 16 Oct 2022 00:10:00 -0700 Subject: [PATCH 9/9] Move comment placement --- openai/tests/test_long_examples_validator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/openai/tests/test_long_examples_validator.py b/openai/tests/test_long_examples_validator.py index 6e09593cea..7f3e4c8cf1 100644 --- a/openai/tests/test_long_examples_validator.py +++ b/openai/tests/test_long_examples_validator.py @@ -40,7 +40,9 @@ def test_long_examples_validator() -> None: shell=True ) - assert prepared_data_cmd_output.stderr == "" # validate data was prepared successfully - assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() + # validate data was prepared successfully + assert prepared_data_cmd_output.stderr == "" + # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() + assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout return prepared_data_cmd_output.stdout \ No newline at end of file