Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
This repository was archived by the owner on Jan 7, 2025. It is now read-only.

Commit 2da0b05

Browse files
committed
Rework Torch multi-GPU training
1 parent a0a5741 commit 2da0b05

5 files changed

Lines changed: 13 additions & 42 deletions

File tree

digits/standard-networks/torch/ImageNet-Training/alexnet.lua

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ return function(params)
8282
end
8383
return {
8484
model = createModel(params.ngpus, channels, nclasses),
85+
disableAutoDataParallelism = true,
8586
croplen = 224,
8687
trainBatchSize = 128,
8788
validationBatchSize = 32,

digits/standard-networks/torch/ImageNet-Training/googlenet.lua

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ local function inception(input_size, config)
5555
return concat
5656
end
5757

58-
function createModel(nGPU, nChannels, nClasses)
58+
function createModel(nChannels, nClasses)
5959
-- batch normalization added on top of convolutional layers in feature branch
6060
-- in order to help the network learn faster
6161
local features = nn.Sequential()
@@ -99,28 +99,10 @@ function createModel(nGPU, nChannels, nClasses)
9999
local splitter = nn.Concat(2)
100100
splitter:add(main_branch):add(aux_classifier)
101101
--local googlenet = nn.Sequential():add(features):add(splitter)
102-
local googlenet = nn.Sequential():add(features):add(main_branch)
103102

104-
local model
105-
if nGPU>1 then
106-
local gpus = torch.range(1, nGPU):totable()
107-
local fastest, benchmark
108-
local use_cudnn = cudnn ~= nil
109-
if use_cudnn then
110-
fastest, benchmark = cudnn.fastest, cudnn.benchmark
111-
end
112-
model = nn.DataParallelTable(1, true, true):add(googlenet,gpus):threads(function()
113-
if use_cudnn then
114-
local cudnn = require 'cudnn'
115-
cudnn.fastest, cudnn.benchmark = fastest, benchmark
116-
end
117-
end)
118-
model.gradInput = nil
119-
else
120-
model = googlenet
121-
end
103+
local googlenet = nn.Sequential():add(features):add(main_branch)
122104

123-
return model
105+
return googlenet
124106
end
125107

126108
-- return function that returns network definition
@@ -135,7 +117,7 @@ return function(params)
135117
assert(params.inputShape[2]==256 and params.inputShape[3]==256, 'Network expects 256x256 images')
136118
end
137119
return {
138-
model = createModel(params.ngpus, channels, nclasses),
120+
model = createModel(channels, nclasses),
139121
croplen = 224,
140122
trainBatchSize = 32,
141123
validationBatchSize = 16,

digits/standard-networks/torch/lenet.lua

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,27 +44,8 @@ return function(params)
4444
lenet:add(nn.Linear(500, nclasses)) -- 500 -> nclasses
4545
lenet:add(nn.LogSoftMax())
4646

47-
local model
48-
if params.ngpus > 1 then
49-
local gpus = torch.range(1, params.ngpus):totable()
50-
local fastest, benchmark
51-
local use_cudnn = cudnn ~= nil
52-
if use_cudnn then
53-
fastest, benchmark = cudnn.fastest, cudnn.benchmark
54-
end
55-
model = nn.DataParallelTable(1, true, true):add(lenet,gpus):threads(function()
56-
if use_cudnn then
57-
local cudnn = require 'cudnn'
58-
cudnn.fastest, cudnn.benchmark = fastest, benchmark
59-
end
60-
end)
61-
model.gradInput = nil
62-
else
63-
model = lenet
64-
end
65-
6647
return {
67-
model = model,
48+
model = lenet,
6849
loss = nn.ClassNLLCriterion(),
6950
trainBatchSize = 64,
7051
validationBatchSize = 32,

docs/GettingStartedTorch.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ labelHook | function | No | A function(input,dblabel) tha
106106
trainBatchSize | number | No | If specified, sets train batch size. May be overridden by user in DIGITS UI.
107107
validationBatchSize | number | No | If specified, sets validation batch size. May be overridden by user in DIGITS UI.
108108
fineTuneHook | function | No | A function(net) that returns the model to be used for fine-tuning. The untuned model is passed as a function parameter.
109+
disableAutoDataParallelism | boolean | No | By default, models are encapsulated in an nn.DataParallelTable container to enable multi-GPU training when more than one GPU is selected. Setting this flag to `true` disables this mechanism.
109110

110111
### Tensors
111112

tools/torch/main.lua

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,12 @@ local parameters = {
280280
network = network_func(parameters)
281281
local model = network.model
282282

283+
-- embed model in parallel table unless explicitly disallowed in user-defined description
284+
if nGpus > 1 and not network.disableAutoDataParallelism then
285+
local gpus = torch.range(1, nGpus):totable()
286+
model = nn.DataParallelTable(1, true, true):add(model, gpus)
287+
end
288+
283289
-- if the loss criterion was not defined in the network
284290
-- use nn.ClassNLLCriterion() by default
285291
local loss = network.loss or nn.ClassNLLCriterion()

0 commit comments

Comments
 (0)