diff --git a/openai/tests/test_long_examples_validator.py b/openai/tests/test_long_examples_validator.py new file mode 100644 index 0000000000..7f3e4c8cf1 --- /dev/null +++ b/openai/tests/test_long_examples_validator.py @@ -0,0 +1,48 @@ +import json +import subprocess +from tempfile import NamedTemporaryFile + + +def test_long_examples_validator() -> None: + + """ + Ensures that long_examples_validator() handles previously applied recommendations, + namely dropped duplicates, without resulting in a KeyError. + """ + + # data + short_prompt = "a prompt " + long_prompt = short_prompt * 500 + + short_completion = "a completion " + long_completion = short_completion * 500 + + # the order of these matters + unprepared_training_data = [ + {"prompt": long_prompt, "completion": long_completion}, # 1 of 2 duplicates + {"prompt": short_prompt, "completion": short_completion}, + {"prompt": long_prompt, "completion": long_completion}, # 2 of 2 duplicates + + ] + + with NamedTemporaryFile(suffix=".jsonl", mode="w") as training_data: + for prompt_completion_row in unprepared_training_data: + training_data.write(json.dumps(prompt_completion_row) + "\n") + training_data.flush() + + prepared_data_cmd_output = subprocess.run( + [f"openai tools fine_tunes.prepare_data -f {training_data.name}"], + stdout=subprocess.PIPE, + text=True, + input="y\ny\ny\ny\ny", # apply all recommendations, one at a time + stderr=subprocess.PIPE, + encoding="utf-8", + shell=True ) + + # validate data was prepared successfully + assert prepared_data_cmd_output.stderr == "" + # validate get_long_indexes() applied during optional_fn() call in long_examples_validator() + assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout + + return prepared_data_cmd_output.stdout \ No newline at end of file diff --git a/openai/validators.py b/openai/validators.py index 0d4d85d4f2..23ff525495 100644 --- a/openai/validators.py +++ b/openai/validators.py @@ -158,17 +158,24 @@ def
long_examples_validator(df): ft_type = infer_task_type(df) if ft_type != "open-ended generation": - long_examples = df.apply( - lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1 - ) - long_indexes = df.reset_index().index[long_examples].tolist() + def get_long_indexes(d): + long_examples = d.apply( + lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1 + ) + return d.reset_index().index[long_examples].tolist() + + long_indexes = get_long_indexes(df) if len(long_indexes) > 0: immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens." optional_msg = f"Remove {len(long_indexes)} long examples" def optional_fn(x): - return x.drop(long_indexes) + + long_indexes_to_drop = get_long_indexes(x) + if long_indexes != long_indexes_to_drop: + sys.stdout.write(f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n") + return x.drop(long_indexes_to_drop) return Remediation( name="long_examples",