From 6f3554487272658bd728687298d96c5f054f683e Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Sun, 25 Sep 2022 13:43:16 -0700
Subject: [PATCH 1/9] Recompute long-example indexes before dropping in long_examples_validator

---
 openai/validators.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/openai/validators.py b/openai/validators.py
index 0d4d85d4f2..6e6ea140c5 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -128,7 +128,7 @@ def duplicated_rows_validator(df, fields=["prompt", "completion"]):
     This validator will suggest to the user to remove duplicate rows if they exist.
     """
     duplicated_rows = df.duplicated(subset=fields)
-    duplicated_indexes = df.reset_index().index[duplicated_rows].tolist()
+    duplicated_indexes = df.index[duplicated_rows].tolist()
     immediate_msg = None
     optional_msg = None
     optional_fn = None
@@ -158,17 +158,26 @@ def long_examples_validator(df):
 
     ft_type = infer_task_type(df)
     if ft_type != "open-ended generation":
-        long_examples = df.apply(
-            lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
-        )
-        long_indexes = df.reset_index().index[long_examples].tolist()
+        def find_long_indexes(d):
+            long_examples = d.apply(
+                lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
+            )
+            long_indexes = d.index[long_examples].tolist()
+            return long_indexes
+
+        long_indexes = find_long_indexes(df)
 
         if len(long_indexes) > 0:
             immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
             optional_msg = f"Remove {len(long_indexes)} long examples"
 
             def optional_fn(x):
-                return x.drop(long_indexes)
+                
+                long_indexes_to_drop = find_long_indexes(x)
+                if long_indexes != long_indexes_to_drop:
+                    print(
+                        f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}")
+                return x.drop(long_indexes_to_drop)
 
     return Remediation(
         name="long_examples",

From 32d936c9009cd11952bcce94f83f750b776c4c67 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Tue, 27 Sep 2022 13:03:10 -0700
Subject: [PATCH 2/9] Rename find_long_indexes to get_long_indexes

---
 openai/validators.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openai/validators.py b/openai/validators.py
index 6e6ea140c5..5d08da60bf 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -158,14 +158,14 @@ def long_examples_validator(df):
 
     ft_type = infer_task_type(df)
     if ft_type != "open-ended generation":
-        def find_long_indexes(d):
+        def get_long_indexes(d):
             long_examples = d.apply(
                 lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
             )
             long_indexes = d.index[long_examples].tolist()
             return long_indexes
 
-        long_indexes = find_long_indexes(df)
+        long_indexes = get_long_indexes(df)
 
         if len(long_indexes) > 0:
             immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
@@ -173,7 +173,7 @@ def find_long_indexes(d):
 
             def optional_fn(x):
                 
-                long_indexes_to_drop = find_long_indexes(x)
+                long_indexes_to_drop = get_long_indexes(x)
                 if long_indexes != long_indexes_to_drop:
                     print(
                         f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}")

From 4f1ed5bb8339186822735cc572ec8b0165279db2 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Tue, 27 Sep 2022 13:10:44 -0700
Subject: [PATCH 3/9] Reinstate reset_index()

---
 openai/validators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openai/validators.py b/openai/validators.py
index 5d08da60bf..0f2237b803 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -128,7 +128,7 @@ def duplicated_rows_validator(df, fields=["prompt", "completion"]):
     This validator will suggest to the user to remove duplicate rows if they exist.
     """
     duplicated_rows = df.duplicated(subset=fields)
-    duplicated_indexes = df.index[duplicated_rows].tolist()
+    duplicated_indexes = df.reset_index().index[duplicated_rows].tolist()
     immediate_msg = None
     optional_msg = None
     optional_fn = None

From 60062e2d12b0aa303362a2b20528cb8e54df1e23 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Fri, 14 Oct 2022 14:09:48 -0700
Subject: [PATCH 4/9] Apply suggestions: positional indexes in get_long_indexes, sys.stdout.write notice

---
 openai/validators.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/openai/validators.py b/openai/validators.py
index 0f2237b803..24be2c4136 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -162,21 +162,21 @@ def get_long_indexes(d):
             long_examples = d.apply(
                 lambda x: len(x.prompt) + len(x.completion) > 10000, axis=1
             )
-            long_indexes = d.index[long_examples].tolist()
-            return long_indexes
+            return d.reset_index().index[long_examples].tolist()
 
         long_indexes = get_long_indexes(df)
 
+        print("long_indexes:", long_indexes)
+
         if len(long_indexes) > 0:
             immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
-            optional_msg = f"Remove {len(long_indexes)} long examples"
+            optional_msg = f"Remove {len(long_indexes)} long examples."
 
             def optional_fn(x):
                 
                 long_indexes_to_drop = get_long_indexes(x)
                 if long_indexes != long_indexes_to_drop:
-                    print(
-                        f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}")
+                    sys.stdout.write(f"The indices of the long examples has changed as a result of a previously applied recommendation.\nThe {len(long_indexes_to_drop)} long examples to be dropped are now at the following indices: {long_indexes_to_drop}\n")
                 return x.drop(long_indexes_to_drop)
 
     return Remediation(

From dfa7e73f0b336da716d47e626f21a247845380f9 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Fri, 14 Oct 2022 15:42:44 -0700
Subject: [PATCH 5/9] Remove print stmt

---
 openai/validators.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/openai/validators.py b/openai/validators.py
index 24be2c4136..0df02d8409 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -166,8 +166,6 @@ def get_long_indexes(d):
 
         long_indexes = get_long_indexes(df)
 
-        print("long_indexes:", long_indexes)
-
         if len(long_indexes) > 0:
             immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
             optional_msg = f"Remove {len(long_indexes)} long examples."

From dca5f6c18607da2ae48336279fb635c4fec528e8 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Fri, 14 Oct 2022 15:44:10 -0700
Subject: [PATCH 6/9] Drop trailing period from long-examples optional_msg

---
 openai/validators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openai/validators.py b/openai/validators.py
index 0df02d8409..23ff525495 100644
--- a/openai/validators.py
+++ b/openai/validators.py
@@ -168,7 +168,7 @@ def get_long_indexes(d):
 
         if len(long_indexes) > 0:
             immediate_msg = f"\n- There are {len(long_indexes)} examples that are very long. These are rows: {long_indexes}\nFor conditional generation, and for classification the examples shouldn't be longer than 2048 tokens."
-            optional_msg = f"Remove {len(long_indexes)} long examples."
+            optional_msg = f"Remove {len(long_indexes)} long examples"
 
             def optional_fn(x):
                 

From b66095535348c1f39d8324de848a490c1f8709a9 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Sun, 16 Oct 2022 00:03:08 -0700
Subject: [PATCH 7/9] Add test for fine_tunes.prepare_data

---
 openai/tests/test_prepare_data.py | 41 +++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 openai/tests/test_prepare_data.py

diff --git a/openai/tests/test_prepare_data.py b/openai/tests/test_prepare_data.py
new file mode 100644
index 0000000000..8d7a9b5743
--- /dev/null
+++ b/openai/tests/test_prepare_data.py
@@ -0,0 +1,41 @@
+import json
+import subprocess
+from tempfile import NamedTemporaryFile
+
+
+def test_prepare_data() -> None:
+
+    # data
+    short_prompt = "a prompt "
+    long_prompt = short_prompt * 500
+
+    short_completion = "a completion "
+    long_completion = short_completion * 500
+
+    # the order of these matters
+    unprepared_training_data = [
+        {"prompt": long_prompt, "completion": long_completion},  # 1 of 2 duplicates
+        {"prompt": short_prompt, "completion": short_completion}, 
+        {"prompt": long_prompt, "completion": long_completion},  # 2 of 2 duplicates
+
+    ]
+
+    with NamedTemporaryFile(suffix="jsonl", mode="w") as training_data:
+        for prompt_completion_row in unprepared_training_data:
+            training_data.write(json.dumps(prompt_completion_row) + "\n")
+            training_data.flush()
+    
+        prepared_data_cmd_output = subprocess.run(
+            [f"openai tools fine_tunes.prepare_data -f {training_data.name}"], 
+            stdout=subprocess.PIPE, 
+            text=True, 
+            input="y\ny\ny\ny\ny",  # apply all recommendations, but one at a time
+            stderr=subprocess.PIPE,
+            encoding="utf-8",
+            shell=True
+        )
+
+    assert prepared_data_cmd_output.stderr == ""  # validate no errors
+    assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout  # validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
+    
+    return prepared_data_cmd_output.stdout
\ No newline at end of file

From 0198c30c839a1a831f1cbd86b30723c1e5c73372 Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Sun, 16 Oct 2022 00:07:54 -0700
Subject: [PATCH 8/9] Renamed file, added docstrings

---
 ...repare_data.py => test_long_examples_validator.py} | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)
 rename openai/tests/{test_prepare_data.py => test_long_examples_validator.py} (81%)

diff --git a/openai/tests/test_prepare_data.py b/openai/tests/test_long_examples_validator.py
similarity index 81%
rename from openai/tests/test_prepare_data.py
rename to openai/tests/test_long_examples_validator.py
index 8d7a9b5743..6e09593cea 100644
--- a/openai/tests/test_prepare_data.py
+++ b/openai/tests/test_long_examples_validator.py
@@ -3,7 +3,12 @@
 from tempfile import NamedTemporaryFile
 
 
-def test_prepare_data() -> None:
+def test_long_examples_validator() -> None:
+
+    """
+    Ensures that long_examples_validator() handles previously applied recommendations,
+    namely dropped duplicates, without resulting in a KeyError.
+    """
 
     # data
     short_prompt = "a prompt "
@@ -29,13 +34,13 @@ def test_prepare_data() -> None:
             [f"openai tools fine_tunes.prepare_data -f {training_data.name}"], 
             stdout=subprocess.PIPE, 
             text=True, 
-            input="y\ny\ny\ny\ny",  # apply all recommendations, but one at a time
+            input="y\ny\ny\ny\ny",  # apply all recommendations, one at a time
             stderr=subprocess.PIPE,
             encoding="utf-8",
             shell=True
         )
 
-    assert prepared_data_cmd_output.stderr == ""  # validate no errors
+    assert prepared_data_cmd_output.stderr == ""  # validate data was prepared successfully
     assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout  # validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
     
     return prepared_data_cmd_output.stdout
\ No newline at end of file

From 1e675bd0886a410cd67569b190cb42adebaf37cd Mon Sep 17 00:00:00 2001
From: Serina Grill <serinagrill@gmail.com>
Date: Sun, 16 Oct 2022 00:10:00 -0700
Subject: [PATCH 9/9] Move comment placement

---
 openai/tests/test_long_examples_validator.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/openai/tests/test_long_examples_validator.py b/openai/tests/test_long_examples_validator.py
index 6e09593cea..7f3e4c8cf1 100644
--- a/openai/tests/test_long_examples_validator.py
+++ b/openai/tests/test_long_examples_validator.py
@@ -40,7 +40,9 @@ def test_long_examples_validator() -> None:
             shell=True
         )
 
-    assert prepared_data_cmd_output.stderr == ""  # validate data was prepared successfully
-    assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout  # validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
+    # validate data was prepared successfully
+    assert prepared_data_cmd_output.stderr == ""  
+    # validate get_long_indexes() applied during optional_fn() call in long_examples_validator()
+    assert "indices of the long examples has changed" in prepared_data_cmd_output.stdout
     
     return prepared_data_cmd_output.stdout
\ No newline at end of file