Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 828cb18

Browse files
albanD and facebook-github-bot
authored and committed
Allow ReadyQueue to handle empty tasks (#15791)
Summary: Allow the comparison function used in ReadyQueue to handle the empty FunctionTasks created by the reentrant autograd. Fixes #11732. Pull Request resolved: #15791. Differential Revision: D13598006. Pulled By: soumith. fbshipit-source-id: 0bfdf28a735fbfe44f0fdbaf8b74a6198e6a1984
1 parent 8a07cbe commit 828cb18

2 files changed

Lines changed: 40 additions & 1 deletion

File tree

test/test_autograd.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,14 @@
1414
from torch.autograd.gradcheck import gradgradcheck, gradcheck
1515
from torch.autograd.function import once_differentiable
1616
from torch.autograd.profiler import profile
17+
from torch.utils.checkpoint import checkpoint
1718
from common_utils import (TEST_MKL, TestCase, run_tests, skipIfNoLapack,
1819
suppress_warnings, skipIfRocm,
1920
prod_single_zero, random_square_matrix_of_rank,
2021
random_symmetric_matrix, random_symmetric_psd_matrix,
2122
random_symmetric_pd_matrix, make_nonzero_det,
2223
random_fullrank_matrix_distinct_singular_value, load_tests)
24+
from common_cuda import TEST_CUDA
2325
from torch.autograd import Variable, Function, detect_anomaly
2426
from torch.autograd.function import InplaceFunction
2527
from torch.testing import make_non_contiguous, randn_like
@@ -2722,6 +2724,36 @@ def fn(sparse):
27222724
with self.assertRaisesRegex(RuntimeError, 'gradcheck expects all tensor inputs are dense'):
27232725
gradcheck(fn, torch.rand(10).to_sparse().requires_grad_(True), check_sparse_nnz=False)
27242726

2727+
@unittest.skipIf(not TEST_CUDA, "Requires cuda for multi device")
2728+
def test_multi_device_reentrant_autograd(self):
2729+
# Output on gpu so that this task will be associated with the gpu thread
2730+
def fn_on_gpu(inp):
2731+
# Artificially increase the priority of the next op to make sure it runs
2732+
# as soon as we reach it before the ops of branch1.
2733+
dummy = inp * 2 * 2 * 2 * 2
2734+
return inp.cuda()
2735+
2736+
def parent_on_cpu(inp):
2737+
# Slow branch of ops on gpu so that the work queue for the gpu thread
2738+
# won't empty too quickly. They also have smaller priorities than the
2739+
# ones created by fn_on_gpu
2740+
branch1 = inp.cuda()
2741+
branch1 = branch1 / branch1
2742+
branch1 = branch1 / branch1
2743+
branch1 = branch1 / branch1
2744+
# Perform checkpoint on cpu tensors, so that the last op performed in the reentrant
2745+
# autograd is an AccumulateGrad that runs on the cpu thread for the gpu thread.
2746+
# As a result, the cpu thread will notify the gpu thread with an empty FunctionTask.
2747+
branch2 = checkpoint(fn_on_gpu, inp)
2748+
out = branch2 + branch1
2749+
return out
2750+
2751+
inp = torch.rand(2, requires_grad=True)
2752+
out = parent_on_cpu(inp)
2753+
# This will segfault if the empty FunctionTask is not handled properly in the
2754+
# gpu thread ReadyQueue
2755+
out.sum().backward()
2756+
27252757

27262758
def index_variable(shape, max_indices):
27272759
if not isinstance(shape, tuple):

torch/csrc/autograd/engine.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,16 @@ struct FunctionTask {
7575
};
7676

7777
// Returns true when t2 should be (weakly) BEFORE t1 in the queue.
78+
// Empty FunctionTasks (those with a null fn) come first.
7879
struct CompareFunctionTaskTime {
7980
bool operator()(FunctionTask const & t1, FunctionTask const & t2) {
80-
return t1.fn->sequence_nr() < t2.fn->sequence_nr();
81+
if (!t1.fn) {
82+
return false;
83+
} else if (!t2.fn) {
84+
return true;
85+
} else {
86+
return t1.fn->sequence_nr() < t2.fn->sequence_nr();
87+
}
8188
}
8289
};
8390

0 commit comments

Comments
 (0)