[BE] Small cleanup in get_files_to_run (#1923)

malfet · web-flow · commit d1f4760df685 · 2022-05-20T20:07:57.000-07:00
Make `get_all_files` and `calculate_shards` work regardless of the
script invocation cwd
Add `test_files_to_run.py`
Do not leak a file descriptor while reading `metadata.json`
Use list comprehensions instead of `list(map(lambda ...))` and `list(filter(lambda ...))`
diff --git a/.jenkins/get_files_to_run.py b/.jenkins/get_files_to_run.py
@@ -6,6 +6,9 @@
 from remove_runnable_code import remove_runnable_code
 
 
+# Calculate repo base dir
+REPO_BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
 def get_all_files(encoding="utf-8") -> List[str]:
     sources = [
         "beginner_source",
@@ -16,11 +19,12 @@ def get_all_files(encoding="utf-8") -> List[str]:
     ]
     cmd = ["find"] + sources + ["-name", "*.py", "-not", "-path", "*/data/*"]
 
-    return run(cmd, capture_output=True).stdout.decode(encoding).splitlines()
+    return run(cmd, capture_output=True, cwd=REPO_BASE_DIR).stdout.decode(encoding).splitlines()
 
 
 def calculate_shards(all_files, num_shards=20):
-    metadata = json.load(open(".jenkins/metadata.json"))
+    with open(os.path.join(REPO_BASE_DIR, ".jenkins", "metadata.json")) as fp:
+        metadata = json.load(fp)
     sharded_files = [(0.0, []) for _ in range(num_shards)]
 
     def get_duration(file):
@@ -47,9 +51,7 @@ def add_to_shard(i, filename):
         # so we'll add all the jobs that need this machine to the 0th worker
         add_to_shard(0, filename)
 
-    all_other_files = list(
-        filter(lambda x: x not in needs_gpu_nvidia_small_multi, all_files)
-    )
+    all_other_files = [x for x in all_files if x not in needs_gpu_nvidia_small_multi]
 
     sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
 
@@ -58,23 +60,23 @@ def add_to_shard(i, filename):
             0
         ]
         add_to_shard(min_shard_index, filename)
-    return list(map(lambda x: x[1], sharded_files))
+    return [x[1] for x in sharded_files]
 
 
-def remove_other_files(all_files, files_to_run):
+def remove_other_files(all_files, files_to_run) -> None:
     for file in all_files:
         if file not in files_to_run:
             remove_runnable_code(file, file)
 
 
-def main():
+def main() -> None:
     num_shards = int(os.environ.get("NUM_WORKERS", 20))
     shard_num = int(os.environ.get("WORKER_ID", 0))
 
     all_files = get_all_files()
     files_to_run = calculate_shards(all_files, num_shards=num_shards)[shard_num]
     remove_other_files(all_files, files_to_run)
-    stripped_file_names = list(map(lambda x: Path(x).stem, files_to_run))
+    stripped_file_names = [Path(x).stem for x in files_to_run]
     print(" ".join(stripped_file_names))
 
 
diff --git a/.jenkins/test_files_to_run.py b/.jenkins/test_files_to_run.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+from get_files_to_run import get_all_files, calculate_shards
+from unittest import TestCase, main
+from functools import reduce
+
+class TestSharding(TestCase):
+    def test_no_sharding(self):
+        all_files=get_all_files()
+        sharded_files = calculate_shards(all_files, 1)
+        self.assertSetEqual(set(all_files), set(sharded_files[0]))
+
+    def test_sharding(self, num_shards=20):
+        all_files=get_all_files()
+        sharded_files = map(set, calculate_shards(all_files, num_shards))
+        self.assertSetEqual(set(all_files), reduce(lambda x,y: x.union(y), sharded_files, set()))
+
+
+
+if __name__ == "__main__":
+    main()