Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 34beb7a

Browse files
reedwm authored and Taylor Robie committed
Fix race condition with ready file. (tensorflow#5271)
1 parent e6ce8cd commit 34beb7a

File tree

2 files changed

+8
-1
lines changed

2 files changed

+8
-1
lines changed

official/recommendation/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def __init__(self, data_dir, cache_id=None):
5858
CYCLES_TO_BUFFER = 3 # The number of train cycles worth of data to "run ahead"
5959
# of the main training loop.
6060

61+
READY_FILE_TEMP = "ready.json.temp"
6162
READY_FILE = "ready.json"
6263
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
6364

official/recommendation/data_async_generation.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,11 +282,17 @@ def _construct_training_records(
282282
raise ValueError("Error detected: point counts do not match: {} vs. {}"
283283
.format(num_pts, written_pts))
284284

285-
with tf.gfile.Open(os.path.join(record_dir, rconst.READY_FILE), "w") as f:
285+
# We write to a temp file then atomically rename it to the final file, because
286+
# writing directly to the final file can cause the main process to read a
287+
# partially written JSON file.
288+
ready_file_temp = os.path.join(record_dir, rconst.READY_FILE_TEMP)
289+
with tf.gfile.Open(ready_file_temp, "w") as f:
286290
json.dump({
287291
"batch_size": train_batch_size,
288292
"batch_count": batch_count,
289293
}, f)
294+
ready_file = os.path.join(record_dir, rconst.READY_FILE)
295+
tf.gfile.Rename(ready_file_temp, ready_file)
290296

291297
log_msg("Cycle {} complete. Total time: {:.1f} seconds"
292298
.format(train_cycle, timeit.default_timer() - st))

0 commit comments

Comments
 (0)