Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5c3b61d

Browse files
[pre-commit.ci] pre-commit autoupdate (#1966)
* [pre-commit.ci] pre-commit autoupdate

  updates:
  - [github.com/pre-commit/mirrors-clang-format: v17.0.6 → v21.1.6](pre-commit/mirrors-clang-format@v17.0.6...v21.1.6)
  - [github.com/astral-sh/ruff-pre-commit: v0.14.0 → v0.14.7](astral-sh/ruff-pre-commit@v0.14.0...v0.14.7)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 87868b0 commit 5c3b61d

89 files changed

Lines changed: 2089 additions & 2089 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,13 @@
11
repos:
22
- repo: https://github.com/pre-commit/mirrors-clang-format
3-
rev: v17.0.6 # Or pin to your preferred clang-format version
3+
rev: v21.1.6 # Or pin to your preferred clang-format version
44
hooks:
55
- id: clang-format
66
files: \.(c|h|cpp|hpp|proto|cu|cuh)$
77
exclude: ^(apex/contrib/csrc/multihead_attn/cutlass|apex/contrib/csrc/cudnn-frontend)/
88

99
- repo: https://github.com/astral-sh/ruff-pre-commit
10-
rev: v0.14.0
10+
rev: v0.14.7
1111
hooks:
1212
- id: ruff-check
1313
args: ["--fix"]

apex/contrib/csrc/bottleneck/bottleneck.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,10 @@ int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int
4646
}
4747

4848
void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort = true);
49-
#define checkCUDAError(val) \
50-
{ checkError((val), #val, __FILE__, __LINE__); } // in-line regular function
49+
#define checkCUDAError(val) \
50+
{ \
51+
checkError((val), #val, __FILE__, __LINE__); \
52+
} // in-line regular function
5153

5254
void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort) {
5355
if (code != cudaSuccess) {

apex/contrib/csrc/conv_bias_relu/conv_bias_relu.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,8 +52,10 @@ int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int
5252
}
5353

5454
void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort = true);
55-
#define checkCUDAError(val) \
56-
{ checkError((val), #val, __FILE__, __LINE__); } // in-line regular function
55+
#define checkCUDAError(val) \
56+
{ \
57+
checkError((val), #val, __FILE__, __LINE__); \
58+
} // in-line regular function
5759

5860
void checkError(cudaError_t code, char const* func, const char* file, const int line, bool abort) {
5961
if (code != cudaSuccess) {

apex/contrib/csrc/cudnn_gbn/norm_sample.cpp

Lines changed: 28 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -30,15 +30,15 @@
3030
#include "cudnn_backend.h"
3131

3232
// some helpers
33-
int64_t checkCudaError(cudaError_t code, const char *expr, const char *file, int line) {
33+
int64_t checkCudaError(cudaError_t code, const char* expr, const char* file, int line) {
3434
if (code) {
3535
printf("CUDA error at %s:%d, code=%d (%s) in '%s'", file, line, (int)code, cudaGetErrorString(code), expr);
3636
return 1;
3737
}
3838
return 0;
3939
}
4040

41-
int64_t checkCudnnError(cudnnStatus_t code, const char *expr, const char *file, int line) {
41+
int64_t checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
4242
if (code) {
4343
printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
4444
return 1;
@@ -51,7 +51,7 @@ bool AllowAll(cudnnBackendDescriptor_t engine_config) {
5151
return false;
5252
}
5353

54-
void generateStrides(const int64_t *dimA, int64_t *strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat) {
54+
void generateStrides(const int64_t* dimA, int64_t* strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat) {
5555
// For INT8x4 and INT8x32 we still compute standard strides here to input
5656
// into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
5757
if (filterFormat == CUDNN_TENSOR_NCHW) {
@@ -71,8 +71,8 @@ void generateStrides(const int64_t *dimA, int64_t *strideA, int64_t nbDims, cudn
7171
}
7272

7373
// runtime
74-
cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_t *perChannelSum, int64_t *epsilon,
75-
int64_t *peerDims, cudnnDataType_t data_type) {
74+
cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
75+
int64_t* peerDims, cudnnDataType_t data_type) {
7676
// get the cudnn handle
7777
cudnnHandle_t handle = torch::native::getCudnnHandle();
7878

@@ -172,9 +172,9 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
172172
.setyDesc(yTensor)
173173
.build();
174174

175-
std::array<cudnn_frontend::Operation const *, 1> ops = {&batch_norm_op};
175+
std::array<cudnn_frontend::Operation const*, 1> ops = {&batch_norm_op};
176176
#else
177-
std::array<cudnn_frontend::Operation const *, 0> ops = {};
177+
std::array<cudnn_frontend::Operation const*, 0> ops = {};
178178
#endif
179179
auto opGraph =
180180
cudnn_frontend::OperationGraphBuilder().setHandle(handle).setOperationGraph(ops.size(), ops.data()).build();
@@ -203,7 +203,7 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
203203
.setEngineConfig(filtered_configs[i], opGraph.getTag())
204204
.build();
205205
return plan;
206-
} catch (cudnn_frontend::cudnnException &e) {
206+
} catch (cudnn_frontend::cudnnException& e) {
207207
continue;
208208
}
209209
}
@@ -219,10 +219,10 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
219219
return plan;
220220
}
221221

222-
void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPtr, void *yDevPtr, void *scaledevPtr,
223-
void *biasdevPtr, void *in_meandevPtr, void *in_vardevPtr, void *out_meandevPtr,
224-
void *out_vardevPtr, void *saved_meandevPtr, void *saved_inv_vardevPtr,
225-
const std::vector<void *> &peer_devPtrs, double epsilon_val,
222+
void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* yDevPtr, void* scaledevPtr,
223+
void* biasdevPtr, void* in_meandevPtr, void* in_vardevPtr, void* out_meandevPtr,
224+
void* out_vardevPtr, void* saved_meandevPtr, void* saved_inv_vardevPtr,
225+
const std::vector<void*>& peer_devPtrs, double epsilon_val,
226226
double exponential_decay_factor, size_t peer_size, int rank_id) {
227227
// get handle
228228
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
@@ -235,13 +235,13 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
235235
// allocate workspace
236236
auto workspace_size = plan.getWorkspaceSize();
237237
auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
238-
void *workPtr = nullptr;
238+
void* workPtr = nullptr;
239239
if (workspace_size > 0) {
240240
workPtr = workspace_tensor.data_ptr<float>();
241241
}
242242

243243
// first the data pointers
244-
std::vector<void *> data_ptrs{
244+
std::vector<void*> data_ptrs{
245245
xDevPtr, yDevPtr, scaledevPtr, biasdevPtr, in_meandevPtr, in_vardevPtr,
246246
out_meandevPtr, out_vardevPtr, saved_meandevPtr, saved_inv_vardevPtr, &epsilon_val, &exponential_decay_factor};
247247
data_ptrs.insert(data_ptrs.end(), peer_devPtrs.begin(), peer_devPtrs.end());
@@ -262,7 +262,7 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
262262
// Reset local communication buffer
263263
cudaMemsetAsync(peer_devPtrs[rank_id], 0, peer_size * 4, stream);
264264

265-
} catch (cudnn_frontend::cudnnException &e) {
265+
} catch (cudnn_frontend::cudnnException& e) {
266266
struct cudaDeviceProp prop;
267267
checkCudaErr(cudaGetDeviceProperties(&prop, 0));
268268
if (prop.major == 8) {
@@ -272,8 +272,8 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
272272
}
273273
}
274274

275-
cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64_t *perChannelSum, int64_t *epsilon,
276-
int64_t *peerDims, cudnnDataType_t data_type) {
275+
cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
276+
int64_t* peerDims, cudnnDataType_t data_type) {
277277
// get cudnn handle
278278
cudnnHandle_t handle = torch::native::getCudnnHandle();
279279

@@ -364,9 +364,9 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
364364
.setPeerStatTensor(peerStatTensors)
365365
.build();
366366

367-
std::array<cudnn_frontend::Operation const *, 1> ops = {&batch_norm_op};
367+
std::array<cudnn_frontend::Operation const*, 1> ops = {&batch_norm_op};
368368
#else
369-
std::array<cudnn_frontend::Operation const *, 0> ops = {};
369+
std::array<cudnn_frontend::Operation const*, 0> ops = {};
370370
#endif
371371

372372
auto opGraph =
@@ -385,7 +385,7 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
385385
.setEngineConfig(filtered_configs[i], opGraph.getTag())
386386
.build();
387387
return plan;
388-
} catch (cudnn_frontend::cudnnException &e) {
388+
} catch (cudnn_frontend::cudnnException& e) {
389389
continue;
390390
}
391391
}
@@ -401,10 +401,10 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
401401
return plan;
402402
}
403403

404-
void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void *xDevPtr, void *dyDevPtr, void *scaledevPtr,
405-
void *saved_meandevPtr, void *saved_inv_vardevPtr,
406-
const std::vector<void *> &peer_devPtrs, void *dxDevPtr, void *dscaledevPtr,
407-
void *dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id) {
404+
void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* dyDevPtr, void* scaledevPtr,
405+
void* saved_meandevPtr, void* saved_inv_vardevPtr,
406+
const std::vector<void*>& peer_devPtrs, void* dxDevPtr, void* dscaledevPtr,
407+
void* dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id) {
408408
// get handle
409409
cudnnHandle_t handle_ = torch::native::getCudnnHandle();
410410

@@ -416,14 +416,14 @@ void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void *xDevP
416416
// allocate workspace
417417
auto workspace_size = plan.getWorkspaceSize();
418418
auto workspace_tensor = at::empty({(workspace_size + 3) / 4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
419-
void *workPtr = nullptr;
419+
void* workPtr = nullptr;
420420
if (workspace_size > 0) {
421421
workPtr = workspace_tensor.data_ptr<float>();
422422
}
423423

424424
// create helper arrays
425-
std::vector<void *> data_ptrs{xDevPtr, dyDevPtr, scaledevPtr, saved_meandevPtr, saved_inv_vardevPtr,
426-
dxDevPtr, dscaledevPtr, dbiasdevPtr, &epsilon_val};
425+
std::vector<void*> data_ptrs{xDevPtr, dyDevPtr, scaledevPtr, saved_meandevPtr, saved_inv_vardevPtr,
426+
dxDevPtr, dscaledevPtr, dbiasdevPtr, &epsilon_val};
427427
data_ptrs.insert(data_ptrs.end(), peer_devPtrs.begin(), peer_devPtrs.end());
428428
std::vector<int64_t> uids;
429429
for (size_t i = 100; i < 100 + data_ptrs.size(); ++i) {
@@ -442,7 +442,7 @@ void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void *xDevP
442442
// Reset local communication buffer
443443
cudaMemsetAsync(peer_devPtrs[rank_id], 0, peer_size * 4, stream);
444444

445-
} catch (cudnn_frontend::cudnnException &e) {
445+
} catch (cudnn_frontend::cudnnException& e) {
446446
struct cudaDeviceProp prop;
447447
checkCudaErr(cudaGetDeviceProperties(&prop, 0));
448448
if (prop.major == 8) {

apex/contrib/csrc/cudnn_gbn/norm_sample.h

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,10 @@
3838

3939
/* some helpers
4040
*/
41-
void generateStrides(const int64_t *dimA, int64_t *strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat);
41+
void generateStrides(const int64_t* dimA, int64_t* strideA, int64_t nbDims, cudnnTensorFormat_t filterFormat);
4242

43-
int64_t checkCudaError(cudaError_t code, const char *expr, const char *file, int line);
44-
int64_t checkCudnnError(cudnnStatus_t code, const char *expr, const char *file, int line);
43+
int64_t checkCudaError(cudaError_t code, const char* expr, const char* file, int line);
44+
int64_t checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line);
4545

4646
#define checkCudaErr(...) \
4747
do { \
@@ -67,8 +67,8 @@ int64_t checkCudnnError(cudnnStatus_t code, const char *expr, const char *file,
6767
6868
*
6969
*/
70-
cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_t *perChannelSum, int64_t *epsilon,
71-
int64_t *peerDims, cudnnDataType_t in_out_data_type);
70+
cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
71+
int64_t* peerDims, cudnnDataType_t in_out_data_type);
7272
/**
7373
* @param xDevPtr input tensor device pointer
7474
* @param yDevPtr output tensor device pointer
@@ -86,10 +86,10 @@ cudnn_frontend::ExecutionPlan run_batch_norm_forward(int64_t *tensorDims, int64_
8686
* @param exponential_decay_factor exponential_decay_factor as a value
8787
*
8888
**/
89-
void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPtr, void *yDevPtr, void *scaledevPtr,
90-
void *biasdevPtr, void *in_meandevPtr, void *in_vardevPtr, void *out_meandevPtr,
91-
void *out_vardevPtr, void *saved_meandevPtr, void *saved_inv_vardevPtr,
92-
const std::vector<void *> &peer_devPtrs, double epsilon_val,
89+
void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* yDevPtr, void* scaledevPtr,
90+
void* biasdevPtr, void* in_meandevPtr, void* in_vardevPtr, void* out_meandevPtr,
91+
void* out_vardevPtr, void* saved_meandevPtr, void* saved_inv_vardevPtr,
92+
const std::vector<void*>& peer_devPtrs, double epsilon_val,
9393
double exponential_decay_factor, size_t peer_size, int rank_id);
9494

9595
/**
@@ -103,8 +103,8 @@ void execute_batch_norm_forward(cudnn_frontend::ExecutionPlan plan, void *xDevPt
103103
* GBN
104104
*
105105
*/
106-
cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64_t *perChannelSum, int64_t *epsilon,
107-
int64_t *peerDims, cudnnDataType_t data_type);
106+
cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t* tensorDims, int64_t* perChannelSum, int64_t* epsilon,
107+
int64_t* peerDims, cudnnDataType_t data_type);
108108

109109
/**
110110
* @brief Run a Group BN backward sample with 2 peer stat tensors.
@@ -124,7 +124,7 @@ cudnn_frontend::ExecutionPlan run_batch_norm_backward(int64_t *tensorDims, int64
124124
* @param epsilon_val epsilon value as a double
125125
*
126126
*/
127-
void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void *xDevPtr, void *dyDevPtr, void *scaledevPtr,
128-
void *saved_meandevPtr, void *saved_inv_vardevPtr,
129-
const std::vector<void *> &peer_devPtrs, void *dxDevPtr, void *dscaledevPtr,
130-
void *dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id);
127+
void execute_batch_norm_backward(cudnn_frontend::ExecutionPlan plan, void* xDevPtr, void* dyDevPtr, void* scaledevPtr,
128+
void* saved_meandevPtr, void* saved_inv_vardevPtr,
129+
const std::vector<void*>& peer_devPtrs, void* dxDevPtr, void* dscaledevPtr,
130+
void* dbiasdevPtr, double epsilon_val, size_t peer_size, int rank_id);

apex/contrib/csrc/fmha/fmha_api.cpp

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -30,12 +30,12 @@
3030

3131
#include "fmha.h"
3232

33-
extern at::Tensor &mha_fill(at::Tensor &self, const at::Tensor &start_index);
34-
void set_params(Fused_multihead_attention_fprop_params &params,
33+
extern at::Tensor& mha_fill(at::Tensor& self, const at::Tensor& start_index);
34+
void set_params(Fused_multihead_attention_fprop_params& params,
3535
// sizes
3636
const size_t b, const size_t s, const size_t h, const size_t d,
3737
// device pointers
38-
void *qkv_packed_d, void *cu_seqlens_d, void *o_packed_d, void *s_d, float p_dropout) {
38+
void* qkv_packed_d, void* cu_seqlens_d, void* o_packed_d, void* s_d, float p_dropout) {
3939
Data_type acc_type = DATA_TYPE_FP32;
4040
Data_type data_type = DATA_TYPE_FP16;
4141

@@ -48,7 +48,7 @@ void set_params(Fused_multihead_attention_fprop_params &params,
4848
params.o_ptr = o_packed_d;
4949
params.o_stride_in_bytes = get_size_in_bytes(h * d, data_type);
5050

51-
params.cu_seqlens = static_cast<int *>(cu_seqlens_d);
51+
params.cu_seqlens = static_cast<int*>(cu_seqlens_d);
5252

5353
// S = softmax(P)
5454
params.s_ptr = s_d;
@@ -77,8 +77,8 @@ void set_params(Fused_multihead_attention_fprop_params &params,
7777
}
7878

7979
std::vector<at::Tensor> mha_fwd(
80-
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
81-
const at::Tensor &cu_seqlens, // b+1
80+
const at::Tensor& qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
81+
const at::Tensor& cu_seqlens, // b+1
8282
const float p_dropout, const int max_seq_len, const bool is_training, const bool is_nl, const bool zero_tensors,
8383
c10::optional<at::Generator> gen_) {
8484
using namespace torch::indexing;
@@ -158,10 +158,10 @@ std::vector<at::Tensor> mha_fwd(
158158
}
159159

160160
std::vector<at::Tensor> mha_bwd(
161-
const at::Tensor &dout, // total x num_heads, x head_size
162-
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
163-
at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
164-
const at::Tensor &cu_seqlens, // b+1
161+
const at::Tensor& dout, // total x num_heads, x head_size
162+
const at::Tensor& qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
163+
at::Tensor& softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
164+
const at::Tensor& cu_seqlens, // b+1
165165
const float p_dropout, // probability to drop
166166
const int max_seq_len, // max sequence length to choose the kernel
167167
const bool zero_tensors) {
@@ -238,10 +238,10 @@ std::vector<at::Tensor> mha_bwd(
238238
}
239239

240240
std::vector<at::Tensor> mha_bwd_nl(
241-
const at::Tensor &dout, // total x num_heads, x head_size
242-
const at::Tensor &qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
243-
at::Tensor &softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
244-
const at::Tensor &cu_seqlens, // b+1
241+
const at::Tensor& dout, // total x num_heads, x head_size
242+
const at::Tensor& qkv, // total x num_heads x 3 x head_size, total := \sum_{i=0}^{b} s_i
243+
at::Tensor& softmax, // b x h x s x s softmax and dmask - will be overwritten with dP
244+
const at::Tensor& cu_seqlens, // b+1
245245
const float p_dropout, // probability to drop
246246
const int max_seq_len, // max sequence length to choose the kernel
247247
const bool zero_tensors) {

0 commit comments

Comments
 (0)