3030#include " cudnn_backend.h"
3131
3232// some helpers
33- int64_t checkCudaError (cudaError_t code, const char * expr, const char * file, int line) {
33+ int64_t checkCudaError (cudaError_t code, const char * expr, const char * file, int line) {
3434 if (code) {
3535 printf (" CUDA error at %s:%d, code=%d (%s) in '%s'" , file, line, (int )code, cudaGetErrorString (code), expr);
3636 return 1 ;
3737 }
3838 return 0 ;
3939}
4040
41- int64_t checkCudnnError (cudnnStatus_t code, const char * expr, const char * file, int line) {
41+ int64_t checkCudnnError (cudnnStatus_t code, const char * expr, const char * file, int line) {
4242 if (code) {
4343 printf (" CUDNN error at %s:%d, code=%d (%s) in '%s'\n " , file, line, (int )code, cudnnGetErrorString (code), expr);
4444 return 1 ;
@@ -51,7 +51,7 @@ bool AllowAll(cudnnBackendDescriptor_t engine_config) {
5151 return false ;
5252}
5353
54- void generateStrides (const int64_t * dimA, int64_t * strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat) {
54+ void generateStrides (const int64_t * dimA, int64_t * strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat) {
5555 // For INT8x4 and INT8x32 we still compute standard strides here to input
5656 // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
5757 if (filterFormat == CUDNN_TENSOR_NCHW) {
@@ -71,8 +71,8 @@ void generateStrides(const int64_t *dimA, int64_t *strideA, int64_t nbDims, cudn
7171}
7272
7373// runtime
74- cudnn_frontend::ExecutionPlan run_batch_norm_forward (int64_t * tensorDims, int64_t * perChannelSum, int64_t * epsilon,
75- int64_t * peerDims, cudnnDataType_t data_type) {
74+ cudnn_frontend::ExecutionPlan run_batch_norm_forward (int64_t * tensorDims, int64_t * perChannelSum, int64_t * epsilon,
75+ int64_t * peerDims, cudnnDataType_t data_type) {
7676 // get the cudnn handle
7777 cudnnHandle_t handle = torch::native::getCudnnHandle ();
7878
@@ -172,9 +172,9 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
172172 .setyDesc (yTensor)
173173 .build ();
174174
175- std::array<cudnn_frontend::Operation const *, 1 > ops = {&batch_norm_op};
175+ std::array<cudnn_frontend::Operation const *, 1 > ops = {&batch_norm_op};
176176#else
177- std::array<cudnn_frontend::Operation const *, 0 > ops = {};
177+ std::array<cudnn_frontend::Operation const *, 0 > ops = {};
178178#endif
179179 auto opGraph =
180180 cudnn_frontend::OperationGraphBuilder ().setHandle (handle).setOperationGraph (ops.size (), ops.data ()).build ();
@@ -203,7 +203,7 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
203203 .setEngineConfig (filtered_configs[i], opGraph.getTag ())
204204 .build ();
205205 return plan;
206- } catch (cudnn_frontend::cudnnException & e) {
206+ } catch (cudnn_frontend::cudnnException& e) {
207207 continue ;
208208 }
209209 }
@@ -219,10 +219,10 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
219219 return plan;
220220}
221221
222- void execute_batch_norm_forward (cudnn_frontend::ExecutionPlan plan, void * xDevPtr, void * yDevPtr, void * scaledevPtr,
223- void * biasdevPtr, void * in_meandevPtr, void * in_vardevPtr, void * out_meandevPtr,
224- void * out_vardevPtr, void * saved_meandevPtr, void * saved_inv_vardevPtr,
225- const std::vector<void *> & peer_devPtrs, double epsilon_val,
222+ void execute_batch_norm_forward (cudnn_frontend::ExecutionPlan plan, void * xDevPtr, void * yDevPtr, void * scaledevPtr,
223+ void * biasdevPtr, void * in_meandevPtr, void * in_vardevPtr, void * out_meandevPtr,
224+ void * out_vardevPtr, void * saved_meandevPtr, void * saved_inv_vardevPtr,
225+ const std::vector<void *>& peer_devPtrs, double epsilon_val,
226226 double exponential_decay_factor, size_t peer_size, int rank_id) {
227227 // get handle
228228 cudnnHandle_t handle_ = torch::native::getCudnnHandle ();
@@ -235,13 +235,13 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
235235 // allocate workspace
236236 auto workspace_size = plan.getWorkspaceSize ();
237237 auto workspace_tensor = at::empty ({(workspace_size + 3 ) / 4 }, at::TensorOptions (at::kCUDA ).dtype (at::kFloat ));
238- void * workPtr = nullptr ;
238+ void * workPtr = nullptr ;
239239 if (workspace_size > 0 ) {
240240 workPtr = workspace_tensor.data_ptr <float >();
241241 }
242242
243243 // first the data pointers
244- std::vector<void *> data_ptrs{
244+ std::vector<void *> data_ptrs{
245245 xDevPtr, yDevPtr, scaledevPtr, biasdevPtr, in_meandevPtr, in_vardevPtr,
246246 out_meandevPtr, out_vardevPtr, saved_meandevPtr, saved_inv_vardevPtr, &epsilon_val, &exponential_decay_factor};
247247 data_ptrs.insert (data_ptrs.end (), peer_devPtrs.begin (), peer_devPtrs.end ());
@@ -262,7 +262,7 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
262262 // Reset local communication buffer
263263 cudaMemsetAsync (peer_devPtrs[rank_id], 0 , peer_size * 4 , stream);
264264
265- } catch (cudnn_frontend::cudnnException & e) {
265+ } catch (cudnn_frontend::cudnnException& e) {
266266 struct cudaDeviceProp prop;
267267 checkCudaErr (cudaGetDeviceProperties (&prop, 0 ));
268268 if (prop.major == 8 ) {
@@ -272,8 +272,8 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
272272 }
273273}
274274
275- cudnn_frontend::ExecutionPlan run_batch_norm_backward (int64_t * tensorDims, int64_t * perChannelSum, int64_t * epsilon,
276- int64_t * peerDims, cudnnDataType_t data_type) {
275+ cudnn_frontend::ExecutionPlan run_batch_norm_backward (int64_t * tensorDims, int64_t * perChannelSum, int64_t * epsilon,
276+ int64_t * peerDims, cudnnDataType_t data_type) {
277277 // get cudnn handle
278278 cudnnHandle_t handle = torch::native::getCudnnHandle ();
279279
@@ -364,9 +364,9 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
364364 .setPeerStatTensor (peerStatTensors)
365365 .build ();
366366
367- std::array<cudnn_frontend::Operation const *, 1 > ops = {&batch_norm_op};
367+ std::array<cudnn_frontend::Operation const *, 1 > ops = {&batch_norm_op};
368368#else
369- std::array<cudnn_frontend::Operation const *, 0 > ops = {};
369+ std::array<cudnn_frontend::Operation const *, 0 > ops = {};
370370#endif
371371
372372 auto opGraph =
@@ -385,7 +385,7 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
385385 .setEngineConfig (filtered_configs[i], opGraph.getTag ())
386386 .build ();
387387 return plan;
388- } catch (cudnn_frontend::cudnnException & e) {
388+ } catch (cudnn_frontend::cudnnException& e) {
389389 continue ;
390390 }
391391 }
@@ -401,10 +401,10 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
401401 return plan;
402402}
403403
404- void execute_batch_norm_backward (cudnn_frontend::ExecutionPlan plan, void * xDevPtr, void * dyDevPtr, void * scaledevPtr,
405- void * saved_meandevPtr, void * saved_inv_vardevPtr,
406- const std::vector<void *> & peer_devPtrs, void * dxDevPtr, void * dscaledevPtr,
407- void * dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id) {
404+ void execute_batch_norm_backward (cudnn_frontend::ExecutionPlan plan, void * xDevPtr, void * dyDevPtr, void * scaledevPtr,
405+ void * saved_meandevPtr, void * saved_inv_vardevPtr,
406+ const std::vector<void *>& peer_devPtrs, void * dxDevPtr, void * dscaledevPtr,
407+ void * dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id) {
408408 // get handle
409409 cudnnHandle_t handle_ = torch::native::getCudnnHandle ();
410410
@@ -416,14 +416,14 @@ void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void *xDevP
416416 // allocate workspace
417417 auto workspace_size = plan.getWorkspaceSize ();
418418 auto workspace_tensor = at::empty ({(workspace_size + 3 ) / 4 }, at::TensorOptions (at::kCUDA ).dtype (at::kFloat ));
419- void * workPtr = nullptr ;
419+ void * workPtr = nullptr ;
420420 if (workspace_size > 0 ) {
421421 workPtr = workspace_tensor.data_ptr <float >();
422422 }
423423
424424 // create helper arrays
425- std::vector<void *> data_ptrs{xDevPtr, dyDevPtr, scaledevPtr, saved_meandevPtr, saved_inv_vardevPtr,
426- dxDevPtr, dscaledevPtr, dbiasdevPtr, &epsilon_val};
425+ std::vector<void *> data_ptrs{xDevPtr, dyDevPtr, scaledevPtr, saved_meandevPtr, saved_inv_vardevPtr,
426+ dxDevPtr, dscaledevPtr, dbiasdevPtr, &epsilon_val};
427427 data_ptrs.insert (data_ptrs.end (), peer_devPtrs.begin (), peer_devPtrs.end ());
428428 std::vector<int64_t > uids;
429429 for (size_t i = 100 ; i < 100 + data_ptrs.size (); ++i) {
@@ -442,7 +442,7 @@ void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void *xDevP
442442 // Reset local communication buffer
443443 cudaMemsetAsync (peer_devPtrs[rank_id], 0 , peer_size * 4 , stream);
444444
445- } catch (cudnn_frontend::cudnnException & e) {
445+ } catch (cudnn_frontend::cudnnException& e) {
446446 struct cudaDeviceProp prop;
447447 checkCudaErr (cudaGetDeviceProperties (&prop, 0 ));
448448 if (prop.major == 8 ) {
0 commit comments