Commit 247f307

It is faster without shared memory
1 parent a7a6634 commit 247f307
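
Summary (editorial, derived from the diff below): both moments kernels previously staged per-thread moment terms in a shared-memory array and combined them with a log2-step tree reduction before publishing one atomicAdd per block. The new code drops the shared-memory staging and the reduction machinery (blockSizeXBit, blockSizeYBit, reduceFolds) and instead issues one global atomicAdd per moment term per contributing pixel, which the commit message reports as faster.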

File tree

1 file changed: +21 additions, -82 deletions


modules/cudaimgproc/src/cuda/moments.cu

Lines changed: 21 additions & 82 deletions
@@ -17,11 +17,8 @@ using namespace cv::cuda::device;
 
 namespace cv { namespace cuda { namespace device { namespace imgproc {
 
-constexpr int blockSizeXBit = 4;
-constexpr int blockSizeYBit = 4;
-constexpr int blockSizeX = 1 << blockSizeXBit;
-constexpr int blockSizeY = 1 << blockSizeYBit;
-constexpr int reduceFolds = blockSizeXBit + blockSizeYBit;
+constexpr int blockSizeX = 16;
+constexpr int blockSizeY = 16;
 constexpr int momentsSize = sizeof(cv::Moments) / sizeof(double);
 
 constexpr int m00 = offsetof(cv::Moments, m00) / sizeof(double);
@@ -45,16 +42,6 @@ constexpr int mu03 = offsetof(cv::Moments, mu03) / sizeof(double);
 
 __global__ void ComputeSpatialMoments(const cuda::PtrStepSzb img, bool binary,
                                       double* moments, double2* centroid) {
-    __shared__ volatile double smem[blockSizeX * blockSizeY][momentsSize];
-
-    // Prepare memory
-    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    volatile double* curr_mem = smem[tid];
-    for (int j = 0; j < momentsSize; ++j) {
-        curr_mem[j] = 0;
-    }
-
-    // Compute moments
     const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
     if (y < img.rows && x < img.cols) {
@@ -64,39 +51,22 @@ __global__ void ComputeSpatialMoments(const cuda::PtrStepSzb img, bool binary,
             const unsigned long x2 = x * x, x3 = x2 * x;
             const unsigned long y2 = y * y, y3 = y2 * y;
 
-            curr_mem[m00] = val;
-            curr_mem[m10] = x * val;
-            curr_mem[m01] = y * val;
-            curr_mem[m20] = x2 * val;
-            curr_mem[m11] = x * y * val;
-            curr_mem[m02] = y2 * val;
-            curr_mem[m30] = x3 * val;
-            curr_mem[m21] = x2 * y * val;
-            curr_mem[m12] = x * y2 * val;
-            curr_mem[m03] = y3 * val;
-        }
-    }
-
-    // Reduce memory
-    for (int p = 0; p < reduceFolds; ++p) {
-        __syncthreads();
-        if (tid % (1 << (p + 1)) == 0) {
-            volatile double* dst_mem = smem[tid];
-            volatile double* src_mem = smem[tid + 1 << p];
-            for (int j = 0; j < momentsSize; ++j) {
-                dst_mem[j] += src_mem[j];
-            }
+            atomicAdd(&moments[m00], val);
+            atomicAdd(&moments[m10], x * val);
+            atomicAdd(&moments[m01], y * val);
+            atomicAdd(&moments[m20], x2 * val);
+            atomicAdd(&moments[m11], x * y * val);
+            atomicAdd(&moments[m02], y2 * val);
+            atomicAdd(&moments[m30], x3 * val);
+            atomicAdd(&moments[m21], x2 * y * val);
+            atomicAdd(&moments[m12], x * y2 * val);
+            atomicAdd(&moments[m03], y3 * val);
         }
     }
 
-    // Publish results
-    __syncthreads();
+    // Compute centroid
+    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
     if (tid == 0) {
-        volatile double* curr_mem = smem[0];
-        for (int j = 0; j < momentsSize; ++j) {
-            atomicAdd(moments + j, curr_mem[j]);
-        }
-
         __syncthreads();
         centroid->x = moments[m10] / moments[m00];
         centroid->y = moments[m01] / moments[m00];
@@ -105,16 +75,6 @@ __global__ void ComputeSpatialMoments(const cuda::PtrStepSzb img, bool binary,
 
 __global__ void ComputeCenteralMoments(const cuda::PtrStepSzb img, bool binary,
                                        const double2* centroid, double* moments) {
-    __shared__ volatile double smem[blockSizeX * blockSizeY][momentsSize];
-
-    // Prepare memory
-    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    volatile double* curr_mem = smem[tid];
-    for (int j = 0; j < momentsSize; ++j) {
-        curr_mem[j] = 0;
-    }
-
-    // Compute moments
     const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
     if (y < img.rows && x < img.cols) {
@@ -124,34 +84,13 @@ __global__ void ComputeCenteralMoments(const cuda::PtrStepSzb img, bool binary,
         const double x1 = x - centroid->x, x2 = x1 * x1, x3 = x2 * x1;
         const double y1 = y - centroid->y, y2 = y1 * y1, y3 = y2 * y1;
 
-            curr_mem[mu20] = x2 * val;
-            curr_mem[mu11] = x1 * y1 * val;
-            curr_mem[mu02] = y2 * val;
-            curr_mem[mu30] = x3 * val;
-            curr_mem[mu21] = x2 * y1 * val;
-            curr_mem[mu12] = x1 * y2 * val;
-            curr_mem[mu03] = y3 * val;
-        }
-    }
-
-    // Reduce memory
-    for (int p = 0; p < reduceFolds; ++p) {
-        __syncthreads();
-        if (tid % (1 << (p + 1)) == 0) {
-            volatile double* dst_mem = smem[tid];
-            volatile double* src_mem = smem[tid + 1 << p];
-            for (int j = 0; j < momentsSize; ++j) {
-                dst_mem[j] += src_mem[j];
-            }
-        }
-    }
-
-    // Publish results
-    __syncthreads();
-    if (tid == 0) {
-        volatile double* curr_mem = smem[0];
-        for (int j = 0; j < momentsSize; ++j) {
-            atomicAdd(moments + j, curr_mem[j]);
+            atomicAdd(&moments[mu20], x2 * val);
+            atomicAdd(&moments[mu11], x1 * y1 * val);
+            atomicAdd(&moments[mu02], y2 * val);
+            atomicAdd(&moments[mu30], x3 * val);
+            atomicAdd(&moments[mu21], x2 * y1 * val);
+            atomicAdd(&moments[mu12], x1 * y2 * val);
+            atomicAdd(&moments[mu03], y3 * val);
         }
     }
 }
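
For context, a minimal self-contained sketch of the accumulation pattern the new kernels use, reduced to the first three spatial moments. The kernel name (momentsSketch), the synthetic constant test image, and the three-element result buffer are illustrative assumptions, not this module's API. Note that atomicAdd on double requires compute capability 6.0 or newer (compile with, e.g., nvcc -arch=sm_60).

// Sketch: per-pixel atomicAdd accumulation of image moments, no shared memory.
// Illustrative only -- the real kernels also handle a binary flag, all ten
// spatial moments (m00..m03), and the centered mu moments.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void momentsSketch(const unsigned char* img, int rows, int cols,
                              double* m /* m[0]=m00, m[1]=m10, m[2]=m01 */) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (y < rows && x < cols) {
        const double val = img[y * cols + x];
        if (val != 0) {
            // One global atomic per moment term, as in the new kernels.
            atomicAdd(&m[0], val);      // m00: total mass
            atomicAdd(&m[1], x * val);  // m10: x-weighted mass
            atomicAdd(&m[2], y * val);  // m01: y-weighted mass
        }
    }
}

int main() {
    const int rows = 256, cols = 256;
    unsigned char* d_img;
    double* d_m;
    cudaMalloc(&d_img, rows * cols);
    cudaMemset(d_img, 1, rows * cols);   // constant image: every pixel = 1
    cudaMalloc(&d_m, 3 * sizeof(double));
    cudaMemset(d_m, 0, 3 * sizeof(double));

    const dim3 block(16, 16);            // matches blockSizeX/blockSizeY = 16
    const dim3 grid((cols + block.x - 1) / block.x,
                    (rows + block.y - 1) / block.y);
    momentsSketch<<<grid, block>>>(d_img, rows, cols, d_m);

    double m[3];
    cudaMemcpy(m, d_m, sizeof(m), cudaMemcpyDeviceToHost);
    // Constant image: m00 = 65536, centroid = (m10/m00, m01/m00) = (127.5, 127.5)
    printf("m00=%.0f centroid=(%.1f, %.1f)\n", m[0], m[1] / m[0], m[2] / m[0]);
    cudaFree(d_img);
    cudaFree(d_m);
    return 0;
}

On the design choice: the removed version allocated blockSizeX * blockSizeY * momentsSize doubles of static shared memory per block, and since momentsSize is sizeof(cv::Moments) / sizeof(double) (24 doubles in OpenCV: 10 spatial, 7 central, 7 normalized), that is 16 x 16 x 24 x 8 B = 48 KB per block, which caps residency at one block per SM on most GPUs; it then ran reduceFolds = 8 tree-reduction rounds, each gated by a block-wide __syncthreads(). The new version trades all of that for contended global atomics, which the commit reports as the faster option. Note also that the removed reduction reads smem[tid + 1 << p], which C++ operator precedence parses as smem[(tid + 1) << p] rather than the presumably intended smem[tid + (1 << p)], so the removed tree reduction did not pair the lanes a standard reduction would.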
