Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ab6c5b7

Browse files
committed
fpga: new op
1 parent 8104c5e commit ab6c5b7

File tree

11 files changed

+863
-151
lines changed

11 files changed

+863
-151
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ FPGA_OBJS=$(patsubst %.cpp,%.xo,$(FPGA_SRC))
5252

5353
FPGA_OUT=ops.${TARGET}.xclbin
5454
fpga:$(FPGA_OBJS)
55-
v++ -O2 -t $(TARGET) --platform=$(PLATFORM) -l -o $(FPGA_OUT) $(FPGA_OBJS)
55+
v++ -t $(TARGET) --platform=$(PLATFORM) -l -o $(FPGA_OUT) $(FPGA_OBJS)
5656
%.xo:%.cpp
5757
#v++ -t $(TARGET) --platform=$(PLATFORM) -c -k $(basename $(notdir $<)) -o '${BUILD}/fpga/$(basename $(notdir $<)).${TARGET}.xo' $<
58-
v++ -O2 -t $(TARGET) --platform=$(PLATFORM) -c -k $(basename $(notdir $<)) -o '$@' $<
58+
v++ -t $(TARGET) --platform=$(PLATFORM) -c -k $(basename $(notdir $<)) -o '$@' $<
5959
rm $@.*
6060

6161
cleanfpga:

src/runtime/opencl/ops/fpga/gemm_bias.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ void gemm_bias(const char *A, const char* B, const int* bias, int *C, const int
2323
const int TK = (K+63)/64*64;
2424
const int TN = (N+63)/64*64;
2525

26-
int offset = TM*TK;
26+
int offset = TK*TN;
2727

2828
#pragma HLS ARRAY_PARTITION variable = bufC dim = 2 complete
2929
#pragma HLS ARRAY_PARTITION variable = bufB dim = 2 complete
@@ -52,7 +52,7 @@ void gemm_bias(const char *A, const char* B, const int* bias, int *C, const int
5252
#pragma HLS PIPELINE II=1
5353
for(int kk = 0; kk < BLOCK_SIZE; kk++){
5454
#pragma HLS UNROLL factor=2
55-
bufA[ii][kk] = A[(i+ii)*TK + k + kk];
55+
bufA[ii][kk] = A[offset + (i+ii)*TK + k + kk];
5656
}
5757
}
5858

@@ -61,7 +61,7 @@ void gemm_bias(const char *A, const char* B, const int* bias, int *C, const int
6161
#pragma HLS PIPELINE II=1
6262
for(int jj = 0; jj < BLOCK_SIZE; jj++){
6363
#pragma HLS UNROLL factor=2
64-
bufB[kk][jj] = B[offset + (k + kk)*TN + j + jj];
64+
bufB[kk][jj] = B[(k + kk)*TN + j + jj];
6565
}
6666
}
6767

src/runtime/opencl/ops/fpga/im2col.cpp

Lines changed: 52 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ void im2col(const int * data_im,
77
const int pad_h, const int pad_w,
88
const int stride_h, const int stride_w,
99
const int dilation_h, const int dilation_w,
10-
const int height_col, const int width_col, const int offset) {
10+
const int height_col, const int width_col) {
1111
#pragma HLS INTERFACE m_axi port=data_im offset=slave bundle=gmem
1212
#pragma HLS INTERFACE m_axi port=data_col offset=slave bundle=gmem
1313
#pragma HLS INTERFACE s_axilite port=data_im bundle=control
@@ -25,47 +25,66 @@ void im2col(const int * data_im,
2525
#pragma HLS INTERFACE s_axilite port=dilation_w bundle=control
2626
#pragma HLS INTERFACE s_axilite port=height_col bundle=control
2727
#pragma HLS INTERFACE s_axilite port=width_col bundle=control
28-
#pragma HLS INTERFACE s_axilite port=offset bundle=control
28+
//#pragma HLS INTERFACE s_axilite port=offset bundle=control
2929
#pragma HLS INTERFACE s_axilite port = return bundle = control
3030

31-
//int tid = threadIdx.x + blockDim.x * blockIdx.x;
32-
//for(int64_t index = tid; index < n; index += gridDim.x*blockDim.x){
31+
const int BS = 32;
32+
const int MAX_KERNEL = 11;
33+
char buf[MAX_KERNEL*MAX_KERNEL][BS];
34+
const int cols = height_col * width_col;
35+
const int cols_offset = (cols + 63) / 64 * 64;
36+
for(int ri = 0; ri < n; ri+=BS){
37+
int chunk_n = BS;
38+
if(ri + BS > n) chunk_n = n - ri;
39+
for(int ti = 0; ti < chunk_n; ti++){
40+
#pragma HLS PIPELINE
41+
int index = ri + ti;
42+
const int h_index = index / width_col;
43+
const int h_col = h_index % height_col;
44+
const int w_col = index % width_col;
45+
const int c_im = h_index / height_col;
46+
const int c_col = c_im * kernel_h * kernel_w;
47+
const int h_offset = h_col * stride_h - pad_h;
48+
const int w_offset = w_col * stride_w - pad_w;
49+
int dst_index = c_col * cols_offset + h_col * width_col + w_col;
50+
int src_index = (c_im * height + h_offset) * width + w_offset;
3351

34-
for(int index = 0; index < n; index++){
35-
const int h_index = index / width_col;
36-
const int h_col = h_index % height_col;
37-
const int w_col = index % width_col;
38-
const int c_im = h_index / height_col;
39-
const int c_col = c_im * kernel_h * kernel_w;
40-
const int h_offset = h_col * stride_h - pad_h;
41-
const int w_offset = w_col * stride_w - pad_w;
42-
//int* data_col_ptr = data_col;
43-
//data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
44-
//int dst_index = offset + (c_col * height_col + h_col) * width_col + w_col;
45-
const int cols = height_col * width_col;
46-
const int cols_offset = (cols + (MATRIX_PAD-1)) / MATRIX_PAD * MATRIX_PAD;
47-
int dst_index = offset + c_col * cols_offset + h_col * width_col + w_col;
48-
//const int * data_im_ptr = data_im;
49-
//data_im_ptr += (c_im * height + h_offset) * width + w_offset;
50-
int src_index = (c_im * height + h_offset) * width + w_offset;
52+
for (int i = 0; i < kernel_h; ++i) {
53+
for (int j = 0; j < kernel_w; ++j) {
54+
int h_im = h_offset + i * dilation_h;
55+
int w_im = w_offset + j * dilation_w;
56+
if(h_im < 0 || w_im < 0 || h_im >= height || w_im >= width){
57+
//data_col[dst_index] = 0;
58+
buf[i*kernel_w + j][ti] = 0;
59+
}else{
60+
//data_col[dst_index] = data_im[src_index + i*dilation_h * width + j * dilation_w];
61+
buf[i*kernel_w + j][ti] = data_im[src_index + i*dilation_h * width + j * dilation_w];
62+
}
63+
//dst_index += cols_offset;//height_col * width_col;
64+
}
65+
}
66+
}
5167

52-
#pragma HLS PIPELINE
5368
for (int i = 0; i < kernel_h; ++i) {
5469
for (int j = 0; j < kernel_w; ++j) {
55-
int h_im = h_offset + i * dilation_h;
56-
int w_im = w_offset + j * dilation_w;
57-
//*data_col_ptr = data_im_ptr[i*dilation_h * width + j * dilation_w];
58-
if(h_im < 0 || w_im < 0 || h_im >= height || w_im >= width){
59-
data_col[dst_index] = 0;
60-
}else{
61-
data_col[dst_index] = data_im[src_index + i*dilation_h * width + j * dilation_w];
70+
const int kernel_index = i*kernel_w + j;
71+
for(int ti = 0; ti < chunk_n; ti++){
72+
#pragma HLS PIPELINE
73+
int index = ri + ti;
74+
const int h_index = index / width_col;
75+
const int h_col = h_index % height_col;
76+
const int w_col = index % width_col;
77+
const int c_im = h_index / height_col;
78+
const int c_col = c_im * kernel_h * kernel_w;
79+
const int h_offset = h_col * stride_h - pad_h;
80+
const int w_offset = w_col * stride_w - pad_w;
81+
int dst_index = c_col * cols_offset + h_col * width_col + w_col + kernel_index * cols_offset;
82+
data_col[dst_index] = buf[kernel_index][ti];
6283
}
63-
//(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
64-
//(data_im_ptr[i * dilation_h * width + j * dilation_w]) : 0;
65-
//data_col_ptr += height_col * width_col;
66-
dst_index += cols_offset;//height_col * width_col;
84+
//dst_index += cols_offset;//height_col * width_col;
6785
}
6886
}
6987
}
88+
7089
}
7190
}

src/runtime/opencl/ops/fpga/int32_to_int8.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
extern "C"{
2-
void int32_to_int8(const int *input, char* output, const int M, const int K){
2+
void int32_to_int8(const int *input, char* output, const int M, const int K, const int offset){
33
#pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem
44
#pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem
55
#pragma HLS INTERFACE s_axilite port=input bundle=control
66
#pragma HLS INTERFACE s_axilite port=output bundle=control
77
#pragma HLS INTERFACE s_axilite port=M bundle=control
88
#pragma HLS INTERFACE s_axilite port=K bundle=control
9+
#pragma HLS INTERFACE s_axilite port=offset bundle=control
910
#pragma HLS INTERFACE s_axilite port = return bundle = control
10-
const int BUF_SIZE = 64;
11+
const int BUF_SIZE = 128;
1112
char buf[BUF_SIZE];
1213
const int n = M * K;
1314
const int TK = (K + 63) / 64 * 64;
@@ -17,15 +18,15 @@ extern "C"{
1718
if(i + chunk_size > n) chunk_size = n - i;
1819
load:
1920
for(int j = 0; j < chunk_size; j++){
20-
#pragma HLS PIPELINE II=1
21+
#pragma HLS PIPELINE
2122
buf[j] = input[i + j];
2223
}
2324
write:
2425
for(int j = 0; j < chunk_size; j++){
25-
#pragma HLS PIPELINE II=1
26+
#pragma HLS PIPELINE
2627
int y = (i + j) / K;
2728
int x = (i + j) % K;
28-
output[y * TK + x] = buf[j];
29+
output[offset + y * TK + x] = buf[j];
2930
}
3031
}
3132
}

src/runtime/opencl/ops/fpga/recursion_sort.cpp

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
extern "C"
22
{
33

4-
void recursion_sort(int *a, int *c, const int M, const int N, const int K)
4+
void recursion_sort(int *a, int *c, const int M, const int N, const int K,
5+
const int a_offset, const int c_offset)
56
{
67
#pragma HLS INTERFACE m_axi port = a offset = slave bundle = gmem
78
#pragma HLS INTERFACE m_axi port = c offset = slave bundle = gmem
@@ -13,6 +14,9 @@ extern "C"
1314
#pragma HLS INTERFACE s_axilite port = N bundle = control
1415
#pragma HLS INTERFACE s_axilite port = K bundle = control
1516

17+
#pragma HLS INTERFACE s_axilite port = a_offset bundle = control
18+
#pragma HLS INTERFACE s_axilite port = c_offset bundle = control
19+
1620
#pragma HLS INTERFACE s_axilite port = return bundle = control
1721
for (int i = 2; i < M * 2; i *= 2)
1822
{
@@ -25,34 +29,34 @@ extern "C"
2529

2630
int k = left * N, l = left, r = mid + 1;
2731
while (l <= mid && r <= right)
28-
if (a[l * N + K] <= a[r * N + K])
32+
if (a[l * N + K] > a[r * N + K])
2933
{
3034
for (int m = l * N; m < l * N + N; m++)
31-
c[k++] = a[m];
35+
c[c_offset + k++] = a[a_offset + m];
3236
l++;
3337
}
3438
else
3539
{
3640
for (int m = r * N; m < r * N + N; m++)
37-
c[k++] = a[m];
41+
c[c_offset + k++] = a[a_offset + m];
3842
r++;
3943
}
4044

4145
while (l <= mid)
4246
{
4347
for (int m = l * N; m < l * N + N; m++)
44-
c[k++] = a[m];
48+
c[c_offset + k++] = a[a_offset + m];
4549
l++;
4650
}
4751
while (r <= right)
4852
{
4953
for (int m = r * N; m < r * N + N; m++)
50-
c[k++] = a[m];
54+
c[c_offset + k++] = a[a_offset + m];
5155
r++;
5256
}
5357
for (int m = left * N; m < (right + 1) * N; m++)
54-
a[m] = c[m];
58+
a[c_offset + m] = c[a_offset + m];
5559
}
5660
}
5761
}
58-
}
62+
}

0 commit comments

Comments
 (0)