Fix/ncf mlperf tweaks: robustness and determinism (tensorflow#5334)

Taylor Robie · web-flow · commit 4dc1080d5f30 · 2018-09-19T18:07:17.000-07:00
* bug fixes and add seed

* more random corrections

* make cleanup more robust

* return cleanup fn

* delint and address PR comments.

* delint and fix tests

* delinting is never done

* add pipeline hashing

* delint
diff --git a/official/recommendation/data_async_generation.py b/official/recommendation/data_async_generation.py
@@ -160,7 +160,8 @@ def _construct_training_records(
     train_batch_size,     # type: int
     training_shards,      # type: typing.List[str]
     spillover,            # type: bool
-    carryover=None        # type: typing.Union[typing.List[np.ndarray], None]
+    carryover=None,       # type: typing.Union[typing.List[np.ndarray], None]
+    deterministic=False   # type: bool
     ):
   """Generate false negatives and write TFRecords files.
 
@@ -204,7 +205,8 @@ def _construct_training_records(
 
   with contextlib.closing(multiprocessing.Pool(
       processes=num_workers, initializer=init_worker)) as pool:
-    data_generator = pool.imap_unordered(_process_shard, map_args)  # pylint: disable=no-member
+    map_fn = pool.imap if deterministic else pool.imap_unordered  # pylint: disable=no-member
+    data_generator = map_fn(_process_shard, map_args)
     data = [
         np.zeros(shape=(num_pts,), dtype=np.int32) - 1,
         np.zeros(shape=(num_pts,), dtype=np.uint16),
@@ -339,16 +341,31 @@ def _construct_eval_record(cache_paths, eval_batch_size):
   log_msg("Eval TFRecords file successfully constructed.")
 
 
-def _generation_loop(
-    num_workers, cache_paths, num_readers, num_neg, num_train_positives,
-    num_items, spillover, epochs_per_cycle, train_batch_size, eval_batch_size):
-  # type: (int, rconst.Paths, int, int, int, int, bool, int, int, int) -> None
+def _generation_loop(num_workers,           # type: int
+                     cache_paths,           # type: rconst.Paths
+                     num_readers,           # type: int
+                     num_neg,               # type: int
+                     num_train_positives,   # type: int
+                     num_items,             # type: int
+                     spillover,             # type: bool
+                     epochs_per_cycle,      # type: int
+                     train_batch_size,      # type: int
+                     eval_batch_size,       # type: int
+                     deterministic          # type: bool
+                    ):
+  # type: (...) -> None
   """Primary run loop for data file generation."""
 
   log_msg("Signaling that I am alive.")
   with tf.gfile.Open(cache_paths.subproc_alive, "w") as f:
     f.write("Generation subproc has started.")
-  atexit.register(tf.gfile.Remove, filename=cache_paths.subproc_alive)
+
+  @atexit.register
+  def remove_alive_file():
+    try:
+      tf.gfile.Remove(cache_paths.subproc_alive)
+    except tf.errors.NotFoundError:
+      return  # Main process has already deleted the entire cache dir.
 
   log_msg("Entering generation loop.")
   tf.gfile.MakeDirs(cache_paths.train_epoch_dir)
@@ -364,7 +381,8 @@ def _generation_loop(
       cache_paths=cache_paths, num_readers=num_readers, num_neg=num_neg,
       num_train_positives=num_train_positives, num_items=num_items,
       epochs_per_cycle=epochs_per_cycle, train_batch_size=train_batch_size,
-      training_shards=training_shards, spillover=spillover, carryover=None)
+      training_shards=training_shards, spillover=spillover, carryover=None,
+      deterministic=deterministic)
 
   _construct_eval_record(cache_paths=cache_paths,
                          eval_batch_size=eval_batch_size)
@@ -397,7 +415,7 @@ def _generation_loop(
         num_train_positives=num_train_positives, num_items=num_items,
         epochs_per_cycle=epochs_per_cycle, train_batch_size=train_batch_size,
         training_shards=training_shards, spillover=spillover,
-        carryover=carryover)
+        carryover=carryover, deterministic=deterministic)
 
     wait_count = 0
     start_time = time.time()
@@ -441,6 +459,7 @@ def main(_):
         epochs_per_cycle=flags.FLAGS.epochs_per_cycle,
         train_batch_size=flags.FLAGS.train_batch_size,
         eval_batch_size=flags.FLAGS.eval_batch_size,
+        deterministic=flags.FLAGS.seed is not None,
     )
   except KeyboardInterrupt:
     log_msg("KeyboardInterrupt registered.")
diff --git a/official/recommendation/data_preprocessing.py b/official/recommendation/data_preprocessing.py
@@ -21,6 +21,7 @@
 import atexit
 import contextlib
 import gc
+import hashlib
 import multiprocessing
 import json
 import os
@@ -50,7 +51,7 @@ class NCFDataset(object):
   """Container for training and testing data."""
 
   def __init__(self, user_map, item_map, num_data_readers, cache_paths,
-               num_train_positives):
+               num_train_positives, deterministic=False):
     # type: (dict, dict, int, rconst.Paths) -> None
     """Assign key values for recommendation dataset.
 
@@ -61,6 +62,8 @@ def __init__(self, user_map, item_map, num_data_readers, cache_paths,
       cache_paths: Object containing locations for various cache files.
       num_train_positives: The number of positive training examples in the
         dataset.
+      deterministic: Operations should use deterministic, order preserving
+        methods, even at the cost of performance.
     """
 
     self.user_map = {int(k): int(v) for k, v in user_map.items()}
@@ -70,6 +73,7 @@ def __init__(self, user_map, item_map, num_data_readers, cache_paths,
     self.num_data_readers = num_data_readers
     self.cache_paths = cache_paths
     self.num_train_positives = num_train_positives
+    self.deterministic = deterministic
 
 
 def _filter_index_sort(raw_rating_path, match_mlperf):
@@ -340,7 +344,8 @@ def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
     pickle.dump(eval_data, f, protocol=pickle.HIGHEST_PROTOCOL)
 
 
-def construct_cache(dataset, data_dir, num_data_readers, match_mlperf):
+def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
+                    deterministic):
   # type: (str, str, int, bool) -> NCFDataset
   """Load and digest data CSV into a usable form.
 
@@ -351,6 +356,8 @@ def construct_cache(dataset, data_dir, num_data_readers, match_mlperf):
       data during training.
     match_mlperf: If True, change the behavior of the cache construction to
       match the MLPerf reference implementation.
+    deterministic: Try to enforce repeatable behavior, even at the cost of
+      performance.
   """
   cache_paths = rconst.Paths(data_dir=data_dir)
   num_data_readers = (num_data_readers or int(multiprocessing.cpu_count() / 2)
@@ -377,7 +384,8 @@ def construct_cache(dataset, data_dir, num_data_readers, match_mlperf):
   ncf_dataset = NCFDataset(user_map=user_map, item_map=item_map,
                            num_data_readers=num_data_readers,
                            cache_paths=cache_paths,
-                           num_train_positives=len(df) - len(user_map))
+                           num_train_positives=len(df) - len(user_map),
+                           deterministic=deterministic)
 
   run_time = timeit.default_timer() - st
   tf.logging.info("Cache construction complete. Time: {:.1f} sec."
@@ -403,13 +411,15 @@ def _shutdown(proc):
 
 def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                          num_data_readers=None, num_neg=4, epochs_per_cycle=1,
-                         match_mlperf=False):
+                         match_mlperf=False, deterministic=False):
+  # type: (...) -> (NCFDataset, typing.Callable)
   """Preprocess data and start negative generation subprocess."""
 
   tf.logging.info("Beginning data preprocessing.")
   ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
                                 num_data_readers=num_data_readers,
-                                match_mlperf=match_mlperf)
+                                match_mlperf=match_mlperf,
+                                deterministic=deterministic)
 
   tf.logging.info("Creating training file subprocess.")
 
@@ -439,18 +449,30 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
                               # guarantee batch size and significantly improves
                               # performance. (~5% increase in examples/sec on
                               # GPU, and needed for TPU XLA.)
-      "--redirect_logs", "True",
-      "--seed", str(int(stat_utils.random_int32()))
+      "--redirect_logs", "True"
   ]
+  if ncf_dataset.deterministic:
+    subproc_args.extend(["--seed", str(int(stat_utils.random_int32()))])
 
   tf.logging.info(
       "Generation subprocess command: {}".format(" ".join(subproc_args)))
 
   proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)
 
-  atexit.register(_shutdown, proc=proc)
-  atexit.register(tf.gfile.DeleteRecursively,
-                  ncf_dataset.cache_paths.cache_root)
+  cleanup_called = {"finished": False}
+  @atexit.register
+  def cleanup():
+    """Remove files and subprocess from data generation."""
+    if cleanup_called["finished"]:
+      return
+
+    _shutdown(proc)
+    try:
+      tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
+    except tf.errors.NotFoundError:
+      pass
+
+    cleanup_called["finished"] = True
 
   for _ in range(300):
     if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
@@ -460,7 +482,7 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
     raise ValueError("Generation subprocess did not start correctly. Data will "
                      "not be available; exiting to avoid waiting forever.")
 
-  return ncf_dataset
+  return ncf_dataset, cleanup
 
 
 def make_deserialize(params, batch_size, training=False):
@@ -498,6 +520,44 @@ def deserialize(examples_serialized):
   return deserialize
 
 
+def hash_pipeline(dataset, deterministic):
+  # type: (tf.data.Dataset, bool) -> None
+  """Utility function for detecting non-determinism in the data pipeline.
+
+  Args:
+    dataset: a tf.data.Dataset generated by the input_fn
+    deterministic: Does the input_fn expect the dataset to be deterministic.
+      (i.e. fixed seed, sloppy=False, etc.)
+  """
+  if not deterministic:
+    tf.logging.warning("Data pipeline is not marked as deterministic. Hash "
+                       "values are not expected to be meaningful.")
+
+  batch = dataset.make_one_shot_iterator().get_next()
+  md5 = hashlib.md5()
+  count = 0
+  first_batch_hash = b""
+  with tf.Session() as sess:
+    while True:
+      try:
+        result = sess.run(batch)
+        if isinstance(result, tuple):
+          result = result[0]  # only hash features
+      except tf.errors.OutOfRangeError:
+        break
+
+      count += 1
+      md5.update(memoryview(result[movielens.USER_COLUMN]).tobytes())
+      md5.update(memoryview(result[movielens.ITEM_COLUMN]).tobytes())
+      if count == 1:
+        first_batch_hash = md5.hexdigest()
+  overall_hash = md5.hexdigest()
+  tf.logging.info("Batch count: {}".format(count))
+  tf.logging.info("  [pipeline_hash] First batch hash: {}".format(
+      first_batch_hash))
+  tf.logging.info("  [pipeline_hash] All batches hash: {}".format(overall_hash))
+
+
 def make_train_input_fn(ncf_dataset):
   # type: (NCFDataset) -> (typing.Callable, str, int)
   """Construct training input_fn for the current epoch."""
@@ -556,14 +616,19 @@ def input_fn(params):
         tf.data.TFRecordDataset,
         cycle_length=4,
         block_length=100000,
-        sloppy=True,
+        sloppy=not ncf_dataset.deterministic,
         prefetch_input_elements=4,
     )
 
     deserialize = make_deserialize(params, batch_size, True)
     dataset = record_files.apply(interleave)
     dataset = dataset.map(deserialize, num_parallel_calls=4)
-    return dataset.prefetch(32)
+    dataset = dataset.prefetch(32)
+
+    if params.get("hash_pipeline"):
+      hash_pipeline(dataset, ncf_dataset.deterministic)
+
+    return dataset
 
   return input_fn, record_dir, batch_count
 
@@ -588,7 +653,11 @@ def input_fn(params):
 
     deserialize = make_deserialize(params, batch_size, False)
     dataset = dataset.map(deserialize, num_parallel_calls=4)
+    dataset = dataset.prefetch(16)
+
+    if params.get("hash_pipeline"):
+      hash_pipeline(dataset, ncf_dataset.deterministic)
 
-    return dataset.prefetch(16)
+    return dataset
 
   return input_fn
diff --git a/official/recommendation/data_test.py b/official/recommendation/data_test.py
@@ -85,14 +85,14 @@ def test_preprocessing(self):
     # construct_cache()
     ncf_dataset = data_preprocessing.construct_cache(
         dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
-        match_mlperf=False)
+        match_mlperf=False, deterministic=False)
     assert ncf_dataset.num_users == NUM_USERS
     assert ncf_dataset.num_items == NUM_ITEMS
 
     time.sleep(1)  # Ensure we create the next cache in a new directory.
     ncf_dataset = data_preprocessing.construct_cache(
         dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
-        match_mlperf=True)
+        match_mlperf=True, deterministic=False)
     assert ncf_dataset.num_users == NUM_USERS
     assert ncf_dataset.num_items == NUM_ITEMS
 
@@ -110,7 +110,7 @@ def drain_dataset(self, dataset, g):
     return output
 
   def test_end_to_end(self):
-    ncf_dataset = data_preprocessing.instantiate_pipeline(
+    ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
         dataset=DATASET, data_dir=self.temp_data_dir,
         batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2,
         num_neg=NUM_NEG)
diff --git a/official/recommendation/ncf_main.py b/official/recommendation/ncf_main.py
@@ -247,6 +247,8 @@ def construct_estimator(num_gpus, model_dir, params, batch_size,
         zone=params["tpu_zone"],
         project=params["tpu_gcp_project"],
     )
+    tf.logging.info("Issuing reset command to TPU to ensure a clean state.")
+    tf.Session.reset(tpu_cluster_resolver.get_master())
 
     tpu_config = tf.contrib.tpu.TPUConfig(
         iterations_per_loop=100,
@@ -297,22 +299,28 @@ def run_ncf(_):
   if FLAGS.download_if_missing:
     movielens.download(FLAGS.dataset, FLAGS.data_dir)
 
+  if FLAGS.seed is not None:
+    np.random.seed(FLAGS.seed)
+
   num_gpus = flags_core.get_num_gpus(FLAGS)
   batch_size = distribution_utils.per_device_batch_size(
       int(FLAGS.batch_size), num_gpus)
   eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
-  ncf_dataset = data_preprocessing.instantiate_pipeline(
+  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
       dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
       batch_size=batch_size,
       eval_batch_size=eval_batch_size,
       num_neg=FLAGS.num_neg,
       epochs_per_cycle=FLAGS.epochs_between_evals,
-      match_mlperf=FLAGS.ml_perf)
+      match_mlperf=FLAGS.ml_perf,
+      deterministic=FLAGS.seed is not None)
 
   model_helpers.apply_clean(flags.FLAGS)
 
   train_estimator, eval_estimator = construct_estimator(
       num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
+          "use_seed": FLAGS.seed is not None,
+          "hash_pipeline": FLAGS.hash_pipeline,
           "batch_size": batch_size,
           "learning_rate": FLAGS.learning_rate,
           "num_users": ncf_dataset.num_users,
@@ -365,6 +373,7 @@ def run_ncf(_):
       tf.logging.warning(
           "Estimated ({}) and reported ({}) number of batches differ by more "
           "than one".format(approx_train_steps, batch_count))
+
     train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
                           steps=batch_count)
     tf.gfile.DeleteRecursively(train_record_dir)
@@ -390,6 +399,8 @@ def run_ncf(_):
     if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
       break
 
+  cleanup_fn()  # Cleanup data construction artifacts and subprocess.
+
   # Clear the session explicitly to avoid session delete error
   tf.keras.backend.clear_session()
 
@@ -496,6 +507,17 @@ def define_ncf_flags():
           "which performs better due to the fact the sorting algorithms are "
           "not stable."))
 
+  flags.DEFINE_integer(
+      name="seed", default=None, help=flags_core.help_wrap(
+          "This value will be used to seed both NumPy and TensorFlow."))
+
+  flags.DEFINE_bool(
+      name="hash_pipeline", default=False, help=flags_core.help_wrap(
+          "This flag will perform a separate run of the pipeline and hash "
+          "batches as they are produced. \nNOTE: this will significantly slow "
+          "training. However it is useful to confirm that a random seed "
+          "does indeed make the data pipeline deterministic."))
+
 
 if __name__ == "__main__":
   tf.logging.set_verbosity(tf.logging.INFO)
diff --git a/official/recommendation/neumf_model.py b/official/recommendation/neumf_model.py
diff --git a/official/recommendation/run.sh b/official/recommendation/run.sh