@@ -17,11 +17,8 @@ using namespace cv::cuda::device;
 
 namespace cv { namespace cuda { namespace device { namespace imgproc {
 
-constexpr int blockSizeXBit = 4;
-constexpr int blockSizeYBit = 4;
-constexpr int blockSizeX = 1 << blockSizeXBit;
-constexpr int blockSizeY = 1 << blockSizeYBit;
-constexpr int reduceFolds = blockSizeXBit + blockSizeYBit;
+constexpr int blockSizeX = 16;
+constexpr int blockSizeY = 16;
 constexpr int momentsSize = sizeof(cv::Moments) / sizeof(double);
 
 constexpr int m00 = offsetof(cv::Moments, m00) / sizeof(double);
@@ -45,16 +42,6 @@ constexpr int mu03 = offsetof(cv::Moments, mu03) / sizeof(double);
 
 __global__ void ComputeSpatialMoments(const cuda::PtrStepSzb img, bool binary,
                                       double* moments, double2* centroid) {
-    __shared__ volatile double smem[blockSizeX * blockSizeY][momentsSize];
-
-    // Prepare memory
-    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    volatile double* curr_mem = smem[tid];
-    for (int j = 0; j < momentsSize; ++j) {
-        curr_mem[j] = 0;
-    }
-
-    // Compute moments
     const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
     if (y < img.rows && x < img.cols) {
@@ -64,39 +51,22 @@ __global__ void ComputeSpatialMoments(const cuda::PtrStepSzb img, bool binary,
             const unsigned long x2 = x * x, x3 = x2 * x;
             const unsigned long y2 = y * y, y3 = y2 * y;
 
-            curr_mem[m00] = val;
-            curr_mem[m10] = x * val;
-            curr_mem[m01] = y * val;
-            curr_mem[m20] = x2 * val;
-            curr_mem[m11] = x * y * val;
-            curr_mem[m02] = y2 * val;
-            curr_mem[m30] = x3 * val;
-            curr_mem[m21] = x2 * y * val;
-            curr_mem[m12] = x * y2 * val;
-            curr_mem[m03] = y3 * val;
-        }
-    }
-
-    // Reduce memory
-    for (int p = 0; p < reduceFolds; ++p) {
-        __syncthreads();
-        if (tid % (1 << (p + 1)) == 0) {
-            volatile double* dst_mem = smem[tid];
-            volatile double* src_mem = smem[tid + 1 << p];
-            for (int j = 0; j < momentsSize; ++j) {
-                dst_mem[j] += src_mem[j];
-            }
+            atomicAdd(&moments[m00], val);
+            atomicAdd(&moments[m10], x * val);
+            atomicAdd(&moments[m01], y * val);
+            atomicAdd(&moments[m20], x2 * val);
+            atomicAdd(&moments[m11], x * y * val);
+            atomicAdd(&moments[m02], y2 * val);
+            atomicAdd(&moments[m30], x3 * val);
+            atomicAdd(&moments[m21], x2 * y * val);
+            atomicAdd(&moments[m12], x * y2 * val);
+            atomicAdd(&moments[m03], y3 * val);
         }
     }
 
-    // Publish results
-    __syncthreads();
+    // Compute centroid
+    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
     if (tid == 0) {
-        volatile double* curr_mem = smem[0];
-        for (int j = 0; j < momentsSize; ++j) {
-            atomicAdd(moments + j, curr_mem[j]);
-        }
-
         __syncthreads();
         centroid->x = moments[m10] / moments[m00];
         centroid->y = moments[m01] / moments[m00];
@@ -105,16 +75,6 @@ __global__ void ComputeSpatialMoments(const cuda::PtrStepSzb img, bool binary,
 
 __global__ void ComputeCenteralMoments(const cuda::PtrStepSzb img, bool binary,
                                        const double2* centroid, double* moments) {
-    __shared__ volatile double smem[blockSizeX * blockSizeY][momentsSize];
-
-    // Prepare memory
-    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-    volatile double* curr_mem = smem[tid];
-    for (int j = 0; j < momentsSize; ++j) {
-        curr_mem[j] = 0;
-    }
-
-    // Compute moments
     const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
     if (y < img.rows && x < img.cols) {
@@ -124,34 +84,13 @@ __global__ void ComputeCenteralMoments(const cuda::PtrStepSzb img, bool binary,
             const double x1 = x - centroid->x, x2 = x1 * x1, x3 = x2 * x1;
             const double y1 = y - centroid->y, y2 = y1 * y1, y3 = y2 * y1;
 
-            curr_mem[mu20] = x2 * val;
-            curr_mem[mu11] = x1 * y1 * val;
-            curr_mem[mu02] = y2 * val;
-            curr_mem[mu30] = x3 * val;
-            curr_mem[mu21] = x2 * y1 * val;
-            curr_mem[mu12] = x1 * y2 * val;
-            curr_mem[mu03] = y3 * val;
-        }
-    }
-
-    // Reduce memory
-    for (int p = 0; p < reduceFolds; ++p) {
-        __syncthreads();
-        if (tid % (1 << (p + 1)) == 0) {
-            volatile double* dst_mem = smem[tid];
-            volatile double* src_mem = smem[tid + 1 << p];
-            for (int j = 0; j < momentsSize; ++j) {
-                dst_mem[j] += src_mem[j];
-            }
-        }
-    }
-
-    // Publish results
-    __syncthreads();
-    if (tid == 0) {
-        volatile double* curr_mem = smem[0];
-        for (int j = 0; j < momentsSize; ++j) {
-            atomicAdd(moments + j, curr_mem[j]);
+            atomicAdd(&moments[mu20], x2 * val);
+            atomicAdd(&moments[mu11], x1 * y1 * val);
+            atomicAdd(&moments[mu02], y2 * val);
+            atomicAdd(&moments[mu30], x3 * val);
+            atomicAdd(&moments[mu21], x2 * y1 * val);
+            atomicAdd(&moments[mu12], x1 * y2 * val);
+            atomicAdd(&moments[mu03], y3 * val);
         }
     }
 }
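
For context, here is a minimal host-side sketch of how the two kernels might be driven after this change. The launcher name, the divUp helper, and the allocation/launch details are assumptions for illustration, not part of the patch; the only names taken from the diff are the kernels, blockSizeX/blockSizeY, and momentsSize. Note that atomicAdd on double requires a device of compute capability 6.0 or newer.

// Hypothetical launcher (not in the patch), assuming the constants and
// kernels above are in scope and <opencv2/core/cuda.hpp> is included.
static int divUp(int a, int b) { return (a + b - 1) / b; }

cv::Moments computeMomentsSketch(const cv::cuda::GpuMat& img, bool binary) {
    double* d_moments = nullptr;
    double2* d_centroid = nullptr;
    cudaMalloc(&d_moments, momentsSize * sizeof(double));
    // The kernels accumulate with atomicAdd, so the buffer must start zeroed.
    cudaMemset(d_moments, 0, momentsSize * sizeof(double));
    cudaMalloc(&d_centroid, sizeof(double2));

    // 16x16 blocks, matching the blockSizeX/blockSizeY the patch hard-codes.
    const dim3 block(blockSizeX, blockSizeY);
    const dim3 grid(divUp(img.cols, block.x), divUp(img.rows, block.y));

    // Spatial moments first: the centroid it writes feeds the
    // central-moment kernel. Both run on the default stream, so the
    // second launch sees the first kernel's results.
    ComputeSpatialMoments<<<grid, block>>>(img, binary, d_moments, d_centroid);
    ComputeCenteralMoments<<<grid, block>>>(img, binary, d_centroid, d_moments);

    // d_moments is laid out via offsetof(cv::Moments, ...), so the whole
    // struct can be copied back in one transfer.
    cv::Moments m;
    cudaMemcpy(&m, d_moments, momentsSize * sizeof(double),
               cudaMemcpyDeviceToHost);
    cudaFree(d_moments);
    cudaFree(d_centroid);
    return m;
}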