Fix rare caching allocator race condition in imagenet prefetcher

definitelynotmcarilli · definitelynotmcarilli · commit d5e2bb4bdeed · 2019-06-17T17:26:45.000-07:00
diff --git a/examples/imagenet/main_amp.py b/examples/imagenet/main_amp.py
@@ -272,9 +272,23 @@ def preload(self):
             self.next_input = None
             self.next_target = None
             return
+        # if record_stream() doesn't work, another option is to make sure device inputs are created
+        # on the main stream.
+        # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda')
+        # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda')
+        # Need to make sure the memory allocated for next_* is not still in use by the main stream
+        # at the time we start copying to next_*:
+        # self.stream.wait_stream(torch.cuda.current_stream())
         with torch.cuda.stream(self.stream):
             self.next_input = self.next_input.cuda(non_blocking=True)
             self.next_target = self.next_target.cuda(non_blocking=True)
+            # more code for the alternative if record_stream() doesn't work:
+            # copy_ will record the use of the pinned source tensor in this side stream.
+            # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
+            # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
+            # self.next_input = self.next_input_gpu
+            # self.next_target = self.next_target_gpu
+
             # With Amp, it isn't necessary to manually convert data to half.
             # if args.fp16:
             #     self.next_input = self.next_input.half()
@@ -286,6 +300,14 @@ def next(self):
         torch.cuda.current_stream().wait_stream(self.stream)
         input = self.next_input
         target = self.next_target
+        # preload() can leave next_input/next_target as None (its early-return
+        # path); calling record_stream() on None would raise AttributeError.
+        if input is not None:
+            # Mark the tensor as in use on the consuming stream so the caching
+            # allocator does not hand its memory back to the side stream early.
+            input.record_stream(torch.cuda.current_stream())
+        if target is not None:
+            target.record_stream(torch.cuda.current_stream())
         self.preload()
         return input, target