Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 4dc1080

Browse files
author
Taylor Robie
authored
Fix/ncf mlperf tweaks: robustness and determinism (tensorflow#5334)
* bug fixes and add seed
* more random corrections
* make cleanup more robust
* return cleanup fn
* delint and address PR comments.
* delint and fix tests
* delinting is never done
* add pipeline hashing
* delint
1 parent 903194c commit 4dc1080

File tree

6 files changed

+153
-33
lines changed

6 files changed

+153
-33
lines changed

official/recommendation/data_async_generation.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,8 @@ def _construct_training_records(
160160
train_batch_size, # type: int
161161
training_shards, # type: typing.List[str]
162162
spillover, # type: bool
163-
carryover=None # type: typing.Union[typing.List[np.ndarray], None]
163+
carryover=None, # type: typing.Union[typing.List[np.ndarray], None]
164+
deterministic=False # type: bool
164165
):
165166
"""Generate false negatives and write TFRecords files.
166167
@@ -204,7 +205,8 @@ def _construct_training_records(
204205

205206
with contextlib.closing(multiprocessing.Pool(
206207
processes=num_workers, initializer=init_worker)) as pool:
207-
data_generator = pool.imap_unordered(_process_shard, map_args) # pylint: disable=no-member
208+
map_fn = pool.imap if deterministic else pool.imap_unordered # pylint: disable=no-member
209+
data_generator = map_fn(_process_shard, map_args)
208210
data = [
209211
np.zeros(shape=(num_pts,), dtype=np.int32) - 1,
210212
np.zeros(shape=(num_pts,), dtype=np.uint16),
@@ -339,16 +341,31 @@ def _construct_eval_record(cache_paths, eval_batch_size):
339341
log_msg("Eval TFRecords file successfully constructed.")
340342

341343

342-
def _generation_loop(
343-
num_workers, cache_paths, num_readers, num_neg, num_train_positives,
344-
num_items, spillover, epochs_per_cycle, train_batch_size, eval_batch_size):
345-
# type: (int, rconst.Paths, int, int, int, int, bool, int, int, int) -> None
344+
def _generation_loop(num_workers, # type: int
345+
cache_paths, # type: rconst.Paths
346+
num_readers, # type: int
347+
num_neg, # type: int
348+
num_train_positives, # type: int
349+
num_items, # type: int
350+
spillover, # type: bool
351+
epochs_per_cycle, # type: int
352+
train_batch_size, # type: int
353+
eval_batch_size, # type: int
354+
deterministic # type: bool
355+
):
356+
# type: (...) -> None
346357
"""Primary run loop for data file generation."""
347358

348359
log_msg("Signaling that I am alive.")
349360
with tf.gfile.Open(cache_paths.subproc_alive, "w") as f:
350361
f.write("Generation subproc has started.")
351-
atexit.register(tf.gfile.Remove, filename=cache_paths.subproc_alive)
362+
363+
@atexit.register
364+
def remove_alive_file():
365+
try:
366+
tf.gfile.Remove(cache_paths.subproc_alive)
367+
except tf.errors.NotFoundError:
368+
return # Main thread has already deleted the entire cache dir.
352369

353370
log_msg("Entering generation loop.")
354371
tf.gfile.MakeDirs(cache_paths.train_epoch_dir)
@@ -364,7 +381,8 @@ def _generation_loop(
364381
cache_paths=cache_paths, num_readers=num_readers, num_neg=num_neg,
365382
num_train_positives=num_train_positives, num_items=num_items,
366383
epochs_per_cycle=epochs_per_cycle, train_batch_size=train_batch_size,
367-
training_shards=training_shards, spillover=spillover, carryover=None)
384+
training_shards=training_shards, spillover=spillover, carryover=None,
385+
deterministic=deterministic)
368386

369387
_construct_eval_record(cache_paths=cache_paths,
370388
eval_batch_size=eval_batch_size)
@@ -397,7 +415,7 @@ def _generation_loop(
397415
num_train_positives=num_train_positives, num_items=num_items,
398416
epochs_per_cycle=epochs_per_cycle, train_batch_size=train_batch_size,
399417
training_shards=training_shards, spillover=spillover,
400-
carryover=carryover)
418+
carryover=carryover, deterministic=deterministic)
401419

402420
wait_count = 0
403421
start_time = time.time()
@@ -441,6 +459,7 @@ def main(_):
441459
epochs_per_cycle=flags.FLAGS.epochs_per_cycle,
442460
train_batch_size=flags.FLAGS.train_batch_size,
443461
eval_batch_size=flags.FLAGS.eval_batch_size,
462+
deterministic=flags.FLAGS.seed is not None,
444463
)
445464
except KeyboardInterrupt:
446465
log_msg("KeyboardInterrupt registered.")

official/recommendation/data_preprocessing.py

Lines changed: 83 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import atexit
2222
import contextlib
2323
import gc
24+
import hashlib
2425
import multiprocessing
2526
import json
2627
import os
@@ -50,7 +51,7 @@ class NCFDataset(object):
5051
"""Container for training and testing data."""
5152

5253
def __init__(self, user_map, item_map, num_data_readers, cache_paths,
53-
num_train_positives):
54+
num_train_positives, deterministic=False):
5455
# type: (dict, dict, int, rconst.Paths) -> None
5556
"""Assign key values for recommendation dataset.
5657
@@ -61,6 +62,8 @@ def __init__(self, user_map, item_map, num_data_readers, cache_paths,
6162
cache_paths: Object containing locations for various cache files.
6263
num_train_positives: The number of positive training examples in the
6364
dataset.
65+
deterministic: Operations should use deterministic, order preserving
66+
methods, even at the cost of performance.
6467
"""
6568

6669
self.user_map = {int(k): int(v) for k, v in user_map.items()}
@@ -70,6 +73,7 @@ def __init__(self, user_map, item_map, num_data_readers, cache_paths,
7073
self.num_data_readers = num_data_readers
7174
self.cache_paths = cache_paths
7275
self.num_train_positives = num_train_positives
76+
self.deterministic = deterministic
7377

7478

7579
def _filter_index_sort(raw_rating_path, match_mlperf):
@@ -340,7 +344,8 @@ def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
340344
pickle.dump(eval_data, f, protocol=pickle.HIGHEST_PROTOCOL)
341345

342346

343-
def construct_cache(dataset, data_dir, num_data_readers, match_mlperf):
347+
def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
348+
deterministic):
344349
# type: (str, str, int, bool) -> NCFDataset
345350
"""Load and digest data CSV into a usable form.
346351
@@ -351,6 +356,8 @@ def construct_cache(dataset, data_dir, num_data_readers, match_mlperf):
351356
data during training.
352357
match_mlperf: If True, change the behavior of the cache construction to
353358
match the MLPerf reference implementation.
359+
deterministic: Try to enforce repeatable behavior, even at the cost of
360+
performance.
354361
"""
355362
cache_paths = rconst.Paths(data_dir=data_dir)
356363
num_data_readers = (num_data_readers or int(multiprocessing.cpu_count() / 2)
@@ -377,7 +384,8 @@ def construct_cache(dataset, data_dir, num_data_readers, match_mlperf):
377384
ncf_dataset = NCFDataset(user_map=user_map, item_map=item_map,
378385
num_data_readers=num_data_readers,
379386
cache_paths=cache_paths,
380-
num_train_positives=len(df) - len(user_map))
387+
num_train_positives=len(df) - len(user_map),
388+
deterministic=deterministic)
381389

382390
run_time = timeit.default_timer() - st
383391
tf.logging.info("Cache construction complete. Time: {:.1f} sec."
@@ -403,13 +411,15 @@ def _shutdown(proc):
403411

404412
def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
405413
num_data_readers=None, num_neg=4, epochs_per_cycle=1,
406-
match_mlperf=False):
414+
match_mlperf=False, deterministic=False):
415+
# type: (...) -> (NCFDataset, typing.Callable)
407416
"""Preprocess data and start negative generation subprocess."""
408417

409418
tf.logging.info("Beginning data preprocessing.")
410419
ncf_dataset = construct_cache(dataset=dataset, data_dir=data_dir,
411420
num_data_readers=num_data_readers,
412-
match_mlperf=match_mlperf)
421+
match_mlperf=match_mlperf,
422+
deterministic=deterministic)
413423

414424
tf.logging.info("Creating training file subprocess.")
415425

@@ -439,18 +449,30 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
439449
# guarantee batch size and significantly improves
440450
# performance. (~5% increase in examples/sec on
441451
# GPU, and needed for TPU XLA.)
442-
"--redirect_logs", "True",
443-
"--seed", str(int(stat_utils.random_int32()))
452+
"--redirect_logs", "True"
444453
]
454+
if ncf_dataset.deterministic:
455+
subproc_args.extend(["--seed", str(int(stat_utils.random_int32()))])
445456

446457
tf.logging.info(
447458
"Generation subprocess command: {}".format(" ".join(subproc_args)))
448459

449460
proc = subprocess.Popen(args=subproc_args, shell=False, env=subproc_env)
450461

451-
atexit.register(_shutdown, proc=proc)
452-
atexit.register(tf.gfile.DeleteRecursively,
453-
ncf_dataset.cache_paths.cache_root)
462+
cleanup_called = {"finished": False}
463+
@atexit.register
464+
def cleanup():
465+
"""Remove files and subprocess from data generation."""
466+
if cleanup_called["finished"]:
467+
return
468+
469+
_shutdown(proc)
470+
try:
471+
tf.gfile.DeleteRecursively(ncf_dataset.cache_paths.cache_root)
472+
except tf.errors.NotFoundError:
473+
pass
474+
475+
cleanup_called["finished"] = True
454476

455477
for _ in range(300):
456478
if tf.gfile.Exists(ncf_dataset.cache_paths.subproc_alive):
@@ -460,7 +482,7 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
460482
raise ValueError("Generation subprocess did not start correctly. Data will "
461483
"not be available; exiting to avoid waiting forever.")
462484

463-
return ncf_dataset
485+
return ncf_dataset, cleanup
464486

465487

466488
def make_deserialize(params, batch_size, training=False):
@@ -498,6 +520,44 @@ def deserialize(examples_serialized):
498520
return deserialize
499521

500522

523+
def hash_pipeline(dataset, deterministic):
524+
# type: (tf.data.Dataset, bool) -> None
525+
"""Utility function for detecting non-determinism in the data pipeline.
526+
527+
Args:
528+
dataset: a tf.data.Dataset generated by the input_fn
529+
deterministic: Does the input_fn expect the dataset to be deterministic.
530+
(i.e. fixed seed, sloppy=False, etc.)
531+
"""
532+
if not deterministic:
533+
tf.logging.warning("Data pipeline is not marked as deterministic. Hash "
534+
"values are not expected to be meaningful.")
535+
536+
batch = dataset.make_one_shot_iterator().get_next()
537+
md5 = hashlib.md5()
538+
count = 0
539+
first_batch_hash = b""
540+
with tf.Session() as sess:
541+
while True:
542+
try:
543+
result = sess.run(batch)
544+
if isinstance(result, tuple):
545+
result = result[0] # only hash features
546+
except tf.errors.OutOfRangeError:
547+
break
548+
549+
count += 1
550+
md5.update(memoryview(result[movielens.USER_COLUMN]).tobytes())
551+
md5.update(memoryview(result[movielens.ITEM_COLUMN]).tobytes())
552+
if count == 1:
553+
first_batch_hash = md5.hexdigest()
554+
overall_hash = md5.hexdigest()
555+
tf.logging.info("Batch count: {}".format(count))
556+
tf.logging.info(" [pipeline_hash] First batch hash: {}".format(
557+
first_batch_hash))
558+
tf.logging.info(" [pipeline_hash] All batches hash: {}".format(overall_hash))
559+
560+
501561
def make_train_input_fn(ncf_dataset):
502562
# type: (NCFDataset) -> (typing.Callable, str, int)
503563
"""Construct training input_fn for the current epoch."""
@@ -556,14 +616,19 @@ def input_fn(params):
556616
tf.data.TFRecordDataset,
557617
cycle_length=4,
558618
block_length=100000,
559-
sloppy=True,
619+
sloppy=not ncf_dataset.deterministic,
560620
prefetch_input_elements=4,
561621
)
562622

563623
deserialize = make_deserialize(params, batch_size, True)
564624
dataset = record_files.apply(interleave)
565625
dataset = dataset.map(deserialize, num_parallel_calls=4)
566-
return dataset.prefetch(32)
626+
dataset = dataset.prefetch(32)
627+
628+
if params.get("hash_pipeline"):
629+
hash_pipeline(dataset, ncf_dataset.deterministic)
630+
631+
return dataset
567632

568633
return input_fn, record_dir, batch_count
569634

@@ -588,7 +653,11 @@ def input_fn(params):
588653

589654
deserialize = make_deserialize(params, batch_size, False)
590655
dataset = dataset.map(deserialize, num_parallel_calls=4)
656+
dataset = dataset.prefetch(16)
657+
658+
if params.get("hash_pipeline"):
659+
hash_pipeline(dataset, ncf_dataset.deterministic)
591660

592-
return dataset.prefetch(16)
661+
return dataset
593662

594663
return input_fn

official/recommendation/data_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,14 @@ def test_preprocessing(self):
8585
# construct_cache()
8686
ncf_dataset = data_preprocessing.construct_cache(
8787
dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
88-
match_mlperf=False)
88+
match_mlperf=False, deterministic=False)
8989
assert ncf_dataset.num_users == NUM_USERS
9090
assert ncf_dataset.num_items == NUM_ITEMS
9191

9292
time.sleep(1) # Ensure we create the next cache in a new directory.
9393
ncf_dataset = data_preprocessing.construct_cache(
9494
dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
95-
match_mlperf=True)
95+
match_mlperf=True, deterministic=False)
9696
assert ncf_dataset.num_users == NUM_USERS
9797
assert ncf_dataset.num_items == NUM_ITEMS
9898

@@ -110,7 +110,7 @@ def drain_dataset(self, dataset, g):
110110
return output
111111

112112
def test_end_to_end(self):
113-
ncf_dataset = data_preprocessing.instantiate_pipeline(
113+
ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
114114
dataset=DATASET, data_dir=self.temp_data_dir,
115115
batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE, num_data_readers=2,
116116
num_neg=NUM_NEG)

official/recommendation/ncf_main.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,8 @@ def construct_estimator(num_gpus, model_dir, params, batch_size,
247247
zone=params["tpu_zone"],
248248
project=params["tpu_gcp_project"],
249249
)
250+
tf.logging.info("Issuing reset command to TPU to ensure a clean state.")
251+
tf.Session.reset(tpu_cluster_resolver.get_master())
250252

251253
tpu_config = tf.contrib.tpu.TPUConfig(
252254
iterations_per_loop=100,
@@ -297,22 +299,28 @@ def run_ncf(_):
297299
if FLAGS.download_if_missing:
298300
movielens.download(FLAGS.dataset, FLAGS.data_dir)
299301

302+
if FLAGS.seed is not None:
303+
np.random.seed(FLAGS.seed)
304+
300305
num_gpus = flags_core.get_num_gpus(FLAGS)
301306
batch_size = distribution_utils.per_device_batch_size(
302307
int(FLAGS.batch_size), num_gpus)
303308
eval_batch_size = int(FLAGS.eval_batch_size or FLAGS.batch_size)
304-
ncf_dataset = data_preprocessing.instantiate_pipeline(
309+
ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
305310
dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
306311
batch_size=batch_size,
307312
eval_batch_size=eval_batch_size,
308313
num_neg=FLAGS.num_neg,
309314
epochs_per_cycle=FLAGS.epochs_between_evals,
310-
match_mlperf=FLAGS.ml_perf)
315+
match_mlperf=FLAGS.ml_perf,
316+
deterministic=FLAGS.seed is not None)
311317

312318
model_helpers.apply_clean(flags.FLAGS)
313319

314320
train_estimator, eval_estimator = construct_estimator(
315321
num_gpus=num_gpus, model_dir=FLAGS.model_dir, params={
322+
"use_seed": FLAGS.seed is not None,
323+
"hash_pipeline": FLAGS.hash_pipeline,
316324
"batch_size": batch_size,
317325
"learning_rate": FLAGS.learning_rate,
318326
"num_users": ncf_dataset.num_users,
@@ -365,6 +373,7 @@ def run_ncf(_):
365373
tf.logging.warning(
366374
"Estimated ({}) and reported ({}) number of batches differ by more "
367375
"than one".format(approx_train_steps, batch_count))
376+
368377
train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
369378
steps=batch_count)
370379
tf.gfile.DeleteRecursively(train_record_dir)
@@ -390,6 +399,8 @@ def run_ncf(_):
390399
if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
391400
break
392401

402+
cleanup_fn() # Cleanup data construction artifacts and subprocess.
403+
393404
# Clear the session explicitly to avoid session delete error
394405
tf.keras.backend.clear_session()
395406

@@ -496,6 +507,17 @@ def define_ncf_flags():
496507
"which performs better due to the fact the sorting algorithms are "
497508
"not stable."))
498509

510+
flags.DEFINE_integer(
511+
name="seed", default=None, help=flags_core.help_wrap(
512+
"This value will be used to seed both NumPy and TensorFlow."))
513+
514+
flags.DEFINE_bool(
515+
name="hash_pipeline", default=False, help=flags_core.help_wrap(
516+
"This flag will perform a separate run of the pipeline and hash "
517+
"batches as they are produced. \nNOTE: this will significantly slow "
518+
"training. However it is useful to confirm that a random seed is "
519+
"does indeed make the data pipeline deterministic."))
520+
499521

500522
if __name__ == "__main__":
501523
tf.logging.set_verbosity(tf.logging.INFO)

0 commit comments

Comments
 (0)