@@ -357,8 +357,8 @@ def generate_train_eval_data(df, approx_num_shards, num_items, cache_paths,
357
357
358
358
359
359
def construct_cache (dataset , data_dir , num_data_readers , match_mlperf ,
360
- deterministic ):
361
- # type: (str, str, int, bool) -> NCFDataset
360
+ deterministic , cache_id = None ):
361
+ # type: (str, str, int, bool, bool, typing.Optional[int] ) -> NCFDataset
362
362
"""Load and digest data CSV into a usable form.
363
363
364
364
Args:
@@ -371,7 +371,7 @@ def construct_cache(dataset, data_dir, num_data_readers, match_mlperf,
371
371
deterministic: Try to enforce repeatable behavior, even at the cost of
372
372
performance.
373
373
"""
374
- cache_paths = rconst .Paths (data_dir = data_dir )
374
+ cache_paths = rconst .Paths (data_dir = data_dir , cache_id = cache_id )
375
375
num_data_readers = (num_data_readers or int (multiprocessing .cpu_count () / 2 )
376
376
or 1 )
377
377
approx_num_shards = int (movielens .NUM_RATINGS [dataset ]
@@ -436,15 +436,16 @@ def _shutdown(proc):
436
436
def instantiate_pipeline (dataset , data_dir , batch_size , eval_batch_size ,
437
437
num_data_readers = None , num_neg = 4 , epochs_per_cycle = 1 ,
438
438
match_mlperf = False , deterministic = False ,
439
- use_subprocess = True ):
439
+ use_subprocess = True , cache_id = None ):
440
440
# type: (...) -> (NCFDataset, typing.Callable)
441
441
"""Preprocess data and start negative generation subprocess."""
442
442
443
443
tf .logging .info ("Beginning data preprocessing." )
444
444
ncf_dataset = construct_cache (dataset = dataset , data_dir = data_dir ,
445
445
num_data_readers = num_data_readers ,
446
446
match_mlperf = match_mlperf ,
447
- deterministic = deterministic )
447
+ deterministic = deterministic ,
448
+ cache_id = cache_id )
448
449
# By limiting the number of workers we guarantee that the worker
449
450
# pool underlying the training generation doesn't starve other processes.
450
451
num_workers = int (multiprocessing .cpu_count () * 0.75 ) or 1
@@ -473,13 +474,14 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
473
474
# We write to a temp file then atomically rename it to the final file,
474
475
# because writing directly to the final file can cause the data generation
475
476
# async process to read a partially written flagfile.
476
- flagfile_temp = os .path .join (data_dir , rconst .FLAGFILE_TEMP )
477
+ flagfile_temp = os .path .join (ncf_dataset .cache_paths .cache_root ,
478
+ rconst .FLAGFILE_TEMP )
477
479
tf .logging .info ("Preparing flagfile for async data generation in {} ..."
478
480
.format (flagfile_temp ))
479
481
with tf .gfile .Open (flagfile_temp , "w" ) as f :
480
482
for k , v in six .iteritems (flags_ ):
481
483
f .write ("--{}={}\n " .format (k , v ))
482
- flagfile = os .path .join (data_dir , rconst .FLAGFILE )
484
+ flagfile = os .path .join (ncf_dataset . cache_paths . cache_root , rconst .FLAGFILE )
483
485
tf .gfile .Rename (flagfile_temp , flagfile )
484
486
tf .logging .info (
485
487
"Wrote flagfile for async data generation in {}."
@@ -493,7 +495,8 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
493
495
# contention with the main training process.
494
496
subproc_env ["CUDA_VISIBLE_DEVICES" ] = ""
495
497
subproc_args = popen_helper .INVOCATION + [
496
- "--data_dir" , data_dir ]
498
+ "--data_dir" , data_dir ,
499
+ "--cache_id" , str (ncf_dataset .cache_paths .cache_id )]
497
500
tf .logging .info (
498
501
"Generation subprocess command: {}" .format (" " .join (subproc_args )))
499
502
proc = subprocess .Popen (args = subproc_args , shell = False , env = subproc_env )
0 commit comments