Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Fix KeyError occurring using fine_tunes.prepare_data #125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions openai/tests/test_long_examples_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import json
import subprocess
from tempfile import NamedTemporaryFile


def test_long_examples_validator() -> None:
    """
    Ensure long_examples_validator() handles previously applied
    recommendations — namely dropped duplicates — without raising a
    KeyError (regression test for the stale-index bug).

    Runs the real ``openai tools fine_tunes.prepare_data`` CLI against a
    temporary JSONL file and inspects its stdout/stderr.
    """
    # Fixture data: long rows exceed the 10k prompt+completion threshold
    # used by long_examples_validator().
    short_prompt = "a prompt "
    long_prompt = short_prompt * 500

    short_completion = "a completion "
    long_completion = short_completion * 500

    # The order of these matters: the duplicate long rows bracket the short
    # row, so dropping duplicates shifts the indices of the long examples
    # and exercises the re-computation path.
    unprepared_training_data = [
        {"prompt": long_prompt, "completion": long_completion},  # 1 of 2 duplicates
        {"prompt": short_prompt, "completion": short_completion},
        {"prompt": long_prompt, "completion": long_completion},  # 2 of 2 duplicates
    ]

    # NOTE: suffix needs the leading dot; "jsonl" alone yields a name that
    # merely *ends with* jsonl rather than having a .jsonl extension.
    with NamedTemporaryFile(suffix=".jsonl", mode="w") as training_data:
        for prompt_completion_row in unprepared_training_data:
            training_data.write(json.dumps(prompt_completion_row) + "\n")
        training_data.flush()

        # argv list + shell=False: no shell parsing, so the temp-file path
        # cannot be mangled by shell metacharacters.
        prepared_data_cmd_output = subprocess.run(
            ["openai", "tools", "fine_tunes.prepare_data", "-f", training_data.name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            input="y\ny\ny\ny\ny",  # apply all recommendations, one at a time
            encoding="utf-8",  # implies text mode; a separate text=True is redundant
            shell=False,
        )

    # validate data was prepared successfully (no traceback on stderr)
    assert prepared_data_cmd_output.stderr == ""
    # validate get_long_indexes() was applied during the optional_fn() call
    # in long_examples_validator() (message emitted only on the fixed path)
    assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
17 changes: 12 additions & 5 deletions openai/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,17 +158,24 @@ def long_examples_validator(df):

ft_type = infer_task_type(df)
if ft_type != "open-ended generation":
long_examples = df.apply(
lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
)
long_indexes = df.reset_index().index[long_examples].tolist()
def get_long_indexes(d):
long_examples = d.apply(
lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
)
return d.reset_index().index[long_examples].tolist()

long_indexes = get_long_indexes(df)

if len(long_indexes) > 0:
immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
optional_msg = f"Remove {len(long_indexes)} long examples"

def optional_fn(x):
return x.drop(long_indexes)

long_indexes_to_drop = get_long_indexes(x)
if long_indexes != long_indexes_to_drop:
sys.stdout.write(f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n")
return x.drop(long_indexes_to_drop)

return Remediation(
name="long_examples",
Expand Down