CortexFoundation
diff --git a/Makefile b/Makefile
Lines changed: 2 additions & 2 deletions
diff --git a/src/runtime/opencl/ops/fpga/gemm_bias.cpp b/src/runtime/opencl/ops/fpga/gemm_bias.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/src/runtime/opencl/ops/fpga/im2col.cpp b/src/runtime/opencl/ops/fpga/im2col.cpp
Lines changed: 52 additions & 33 deletions
diff --git a/src/runtime/opencl/ops/fpga/int32_to_int8.cpp b/src/runtime/opencl/ops/fpga/int32_to_int8.cpp
Lines changed: 6 additions & 5 deletions
diff --git a/src/runtime/opencl/ops/fpga/recursion_sort.cpp b/src/runtime/opencl/ops/fpga/recursion_sort.cpp
Lines changed: 12 additions & 8 deletions
@@ -52,10 +52,10 @@ FPGA_OBJS=$(patsubst %.cpp,%.xo,$(FPGA_SRC))
 
 FPGA_OUT=ops.${TARGET}.xclbin
 fpga:$(FPGA_OBJS)
-	v++ -O2 -t $(TARGET) --platform=$(PLATFORM) -l -o $(FPGA_OUT) $(FPGA_OBJS)
+	v++ -t $(TARGET) --platform=$(PLATFORM) -l -o $(FPGA_OUT) $(FPGA_OBJS)
 %.xo:%.cpp
 #v++ -t $(TARGET) --platform=$(PLATFORM) -c -k $(basename $(notdir $<)) -o '${BUILD}/fpga/$(basename $(notdir $<)).${TARGET}.xo' $<
-	v++ -O2 -t $(TARGET) --platform=$(PLATFORM) -c -k $(basename $(notdir $<)) -o '$@' $<
+	v++ -t $(TARGET) --platform=$(PLATFORM) -c -k $(basename $(notdir $<)) -o '$@' $<
 	rm $@.*
 
 cleanfpga:
 
@@ -23,7 +23,7 @@ void gemm_bias(const char *A, const char* B, const int* bias, int *C, const int
   const int TK = (K+63)/64*64;
   const int TN = (N+63)/64*64;
 
-  int offset = TM*TK;
+  int offset = TK*TN;
 
 #pragma HLS ARRAY_PARTITION variable = bufC dim = 2 complete
 #pragma HLS ARRAY_PARTITION variable = bufB dim = 2 complete
@@ -52,7 +52,7 @@ void gemm_bias(const char *A, const char* B, const int* bias, int *C, const int
 #pragma HLS PIPELINE II=1
           for(int kk = 0; kk < BLOCK_SIZE; kk++){
 #pragma HLS UNROLL factor=2
-            bufA[ii][kk] = A[(i+ii)*TK + k + kk];
+            bufA[ii][kk] = A[offset + (i+ii)*TK + k + kk];
           }
         }
 
@@ -61,7 +61,7 @@ void gemm_bias(const char *A, const char* B, const int* bias, int *C, const int
 #pragma HLS PIPELINE II=1
           for(int jj = 0; jj < BLOCK_SIZE; jj++){
 #pragma HLS UNROLL factor=2
-            bufB[kk][jj] = B[offset + (k + kk)*TN + j + jj];
+            bufB[kk][jj] = B[(k + kk)*TN + j + jj];
           }
         }
 
 
@@ -7,7 +7,7 @@ void im2col(const int * data_im,
     const int pad_h, const int pad_w,
     const int stride_h, const int stride_w,
     const int dilation_h, const int dilation_w,
-    const int height_col, const int width_col, const int offset) {
+    const int height_col, const int width_col) {
 #pragma HLS INTERFACE m_axi port=data_im offset=slave bundle=gmem
 #pragma HLS INTERFACE m_axi port=data_col offset=slave bundle=gmem
 #pragma HLS INTERFACE s_axilite port=data_im  bundle=control
@@ -25,47 +25,66 @@ void im2col(const int * data_im,
 #pragma HLS INTERFACE s_axilite port=dilation_w bundle=control
 #pragma HLS INTERFACE s_axilite port=height_col bundle=control
 #pragma HLS INTERFACE s_axilite port=width_col bundle=control
-#pragma HLS INTERFACE s_axilite port=offset bundle=control
+//#pragma HLS INTERFACE s_axilite port=offset bundle=control
 #pragma HLS INTERFACE s_axilite port = return bundle = control
 
-  //int tid = threadIdx.x + blockDim.x * blockIdx.x;
-  //for(int64_t index = tid; index < n; index += gridDim.x*blockDim.x){
+  const int BS = 32;
+  const int MAX_KERNEL = 11;
+  char buf[MAX_KERNEL*MAX_KERNEL][BS];
+  const int cols = height_col * width_col;
+  const int cols_offset = (cols + 63) / 64 * 64;
+  for(int ri = 0; ri < n; ri+=BS){
+    int chunk_n = BS;
+    if(ri + BS > n) chunk_n = n - ri;
+    for(int ti = 0; ti < chunk_n; ti++){
+#pragma HLS PIPELINE
+      int index = ri + ti; 
+      const int h_index = index / width_col;
+      const int h_col = h_index % height_col;
+      const int w_col = index % width_col;
+      const int c_im = h_index / height_col;
+      const int c_col = c_im * kernel_h * kernel_w;
+      const int h_offset = h_col * stride_h - pad_h;
+      const int w_offset = w_col * stride_w - pad_w;
+      int dst_index = c_col * cols_offset + h_col * width_col + w_col;
+      int src_index = (c_im * height + h_offset) * width + w_offset;
 
-  for(int index = 0; index < n; index++){
-    const int h_index = index / width_col;
-    const int h_col = h_index % height_col;
-    const int w_col = index % width_col;
-    const int c_im = h_index / height_col;
-    const int c_col = c_im * kernel_h * kernel_w;
-    const int h_offset = h_col * stride_h - pad_h;
-    const int w_offset = w_col * stride_w - pad_w;
-    //int* data_col_ptr = data_col;
-    //data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
-    //int dst_index = offset + (c_col * height_col + h_col) * width_col + w_col;
-    const int cols = height_col * width_col;
-    const int cols_offset = (cols + (MATRIX_PAD-1)) / MATRIX_PAD * MATRIX_PAD;
-    int dst_index = offset + c_col * cols_offset + h_col * width_col + w_col;
-    //const int * data_im_ptr = data_im;
-    //data_im_ptr += (c_im * height + h_offset) * width + w_offset;
-    int src_index = (c_im * height + h_offset) * width + w_offset;
+      for (int i = 0; i < kernel_h; ++i) {
+        for (int j = 0; j < kernel_w; ++j) {
+          int h_im = h_offset + i * dilation_h;
+          int w_im = w_offset + j * dilation_w;
+          if(h_im < 0 || w_im < 0 || h_im >= height || w_im >= width){
+            //data_col[dst_index] = 0;
+            buf[i*kernel_w + j][ti] = 0;
+          }else{
+            //data_col[dst_index] = data_im[src_index + i*dilation_h * width + j * dilation_w];
+            buf[i*kernel_w + j][ti] = data_im[src_index + i*dilation_h * width + j * dilation_w];
+          }
+          //dst_index += cols_offset;//height_col * width_col;
+        }
+      }
+    }
 
-#pragma HLS PIPELINE
     for (int i = 0; i < kernel_h; ++i) {
       for (int j = 0; j < kernel_w; ++j) {
-        int h_im = h_offset + i * dilation_h;
-        int w_im = w_offset + j * dilation_w;
-        //*data_col_ptr = data_im_ptr[i*dilation_h * width + j * dilation_w];
-        if(h_im < 0 || w_im < 0 || h_im >= height || w_im >= width){
-          data_col[dst_index] = 0;
-        }else{
-          data_col[dst_index] = data_im[src_index + i*dilation_h * width + j * dilation_w];
+        const int kernel_index = i*kernel_w + j;
+        for(int ti = 0; ti < chunk_n; ti++){
+#pragma HLS PIPELINE
+          int index = ri + ti; 
+          const int h_index = index / width_col;
+          const int h_col = h_index % height_col;
+          const int w_col = index % width_col;
+          const int c_im = h_index / height_col;
+          const int c_col = c_im * kernel_h * kernel_w;
+          const int h_offset = h_col * stride_h - pad_h;
+          const int w_offset = w_col * stride_w - pad_w;
+          int dst_index = c_col * cols_offset + h_col * width_col + w_col + kernel_index * cols_offset;
+          data_col[dst_index] = buf[kernel_index][ti];
         }
-        //(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
-        //(data_im_ptr[i * dilation_h * width + j * dilation_w]) : 0;
-        //data_col_ptr += height_col * width_col;
-        dst_index += cols_offset;//height_col * width_col;
+        //dst_index += cols_offset;//height_col * width_col;
       }
     }
   }
+
 }
 }
@@ -1,13 +1,14 @@
 extern "C"{
-  void int32_to_int8(const int *input, char* output, const int M, const int K){
+  void int32_to_int8(const int *input, char* output, const int M, const int K, const int offset){
 #pragma HLS INTERFACE m_axi port=input offset=slave bundle=gmem
 #pragma HLS INTERFACE m_axi port=output offset=slave bundle=gmem
 #pragma HLS INTERFACE s_axilite port=input bundle=control
 #pragma HLS INTERFACE s_axilite port=output bundle=control
 #pragma HLS INTERFACE s_axilite port=M bundle=control
 #pragma HLS INTERFACE s_axilite port=K bundle=control
+#pragma HLS INTERFACE s_axilite port=offset bundle=control
 #pragma HLS INTERFACE s_axilite port = return bundle = control
-    const int BUF_SIZE = 64;
+    const int BUF_SIZE = 128;
     char buf[BUF_SIZE];
     const int n = M * K;
     const int TK = (K + 63) / 64 * 64;
@@ -17,15 +18,15 @@ extern "C"{
       if(i + chunk_size > n) chunk_size = n - i;
 load:
       for(int j = 0; j < chunk_size; j++){
-#pragma HLS PIPELINE II=1
+#pragma HLS PIPELINE
         buf[j] = input[i + j];
       }
 write:
       for(int j = 0; j < chunk_size; j++){
-#pragma HLS PIPELINE II=1
+#pragma HLS PIPELINE
         int y = (i + j) / K;
         int x = (i + j) % K;
-        output[y * TK + x] = buf[j];
+        output[offset + y * TK + x] = buf[j];
       }
     }
   }
 
@@ -1,7 +1,8 @@
 extern "C"
 {
 
-    void recursion_sort(int *a, int *c, const int M, const int N, const int K)
+    void recursion_sort(int *a, int *c, const int M, const int N, const int K, 
+        const int a_offset, const int c_offset)
     {
 #pragma HLS INTERFACE m_axi port = a offset = slave bundle = gmem
 #pragma HLS INTERFACE m_axi port = c offset = slave bundle = gmem
@@ -13,6 +14,9 @@ extern "C"
 #pragma HLS INTERFACE s_axilite port = N bundle = control
 #pragma HLS INTERFACE s_axilite port = K bundle = control
 
+#pragma HLS INTERFACE s_axilite port = a_offset bundle = control
+#pragma HLS INTERFACE s_axilite port = c_offset bundle = control
+
 #pragma HLS INTERFACE s_axilite port = return bundle = control
         for (int i = 2; i < M * 2; i *= 2)
         {
@@ -25,34 +29,34 @@ extern "C"
 
                 int k = left * N, l = left, r = mid + 1;
                 while (l <= mid && r <= right)
-                    if (a[l * N + K] <= a[r * N + K])
+                    if (a[l * N + K] > a[r * N + K])
                     {
                         for (int m = l * N; m < l * N + N; m++)
-                            c[k++] = a[m];
+                            c[c_offset + k++] = a[a_offset + m];
                         l++;
                     }
                     else
                     {
                         for (int m = r * N; m < r * N + N; m++)
-                            c[k++] = a[m];
+                            c[c_offset + k++] = a[a_offset + m];
                         r++;
                     }
 
                 while (l <= mid)
                 {
                     for (int m = l * N; m < l * N + N; m++)
-                        c[k++] = a[m];
+                        c[c_offset + k++] = a[a_offset + m];
                     l++;
                 }
                 while (r <= right)
                 {
                     for (int m = r * N; m < r * N + N; m++)
-                        c[k++] = a[m];
+                        c[c_offset + k++] = a[a_offset + m];
                     r++;
                 }
                 for (int m = left * N; m < (right + 1) * N; m++)
-                    a[m] = c[m];
+                    a[c_offset + m] = c[a_offset + m];
             }
         }
     }
-}
+}
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ void gemm_bias(const char A, const char B, const int* bias, int *C, const int`
`23`	`23`	`const int TK = (K+63)/64*64;`
`24`	`24`	`const int TN = (N+63)/64*64;`
`25`	`25`
`26`		`- int offset = TM*TK;`
	`26`	`+ int offset = TK*TN;`
`27`	`27`
`28`	`28`	`#pragma HLS ARRAY_PARTITION variable = bufC dim = 2 complete`
`29`	`29`	`#pragma HLS ARRAY_PARTITION variable = bufB dim = 2 complete`
`@@ -52,7 +52,7 @@ void gemm_bias(const char A, const char B, const int* bias, int *C, const int`
`52`	`52`	`#pragma HLS PIPELINE II=1`
`53`	`53`	`for(int kk = 0; kk < BLOCK_SIZE; kk++){`
`54`	`54`	`#pragma HLS UNROLL factor=2`
`55`		`- bufA[ii][kk] = A[(i+ii)*TK + k + kk];`
	`55`	`+ bufA[ii][kk] = A[offset + (i+ii)*TK + k + kk];`
`56`	`56`	`}`
`57`	`57`	`}`
`58`	`58`
`@@ -61,7 +61,7 @@ void gemm_bias(const char A, const char B, const int* bias, int *C, const int`
`61`	`61`	`#pragma HLS PIPELINE II=1`
`62`	`62`	`for(int jj = 0; jj < BLOCK_SIZE; jj++){`
`63`	`63`	`#pragma HLS UNROLL factor=2`
`64`		`- bufB[kk][jj] = B[offset + (k + kk)*TN + j + jj];`
	`64`	`+ bufB[kk][jj] = B[(k + kk)*TN + j + jj];`
`65`	`65`	`}`
`66`	`66`	`}`
`67`	`67`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,8 @@`
`1`	`1`	`extern "C"`
`2`	`2`	`{`
`3`	`3`
`4`		`- void recursion_sort(int a, int c, const int M, const int N, const int K)`
	`4`	`+ void recursion_sort(int a, int c, const int M, const int N, const int K,`
	`5`	`+ const int a_offset, const int c_offset)`
`5`	`6`	`{`
`6`	`7`	`#pragma HLS INTERFACE m_axi port = a offset = slave bundle = gmem`
`7`	`8`	`#pragma HLS INTERFACE m_axi port = c offset = slave bundle = gmem`
`@@ -13,6 +14,9 @@ extern "C"`
`13`	`14`	`#pragma HLS INTERFACE s_axilite port = N bundle = control`
`14`	`15`	`#pragma HLS INTERFACE s_axilite port = K bundle = control`
`15`	`16`
	`17`	`+#pragma HLS INTERFACE s_axilite port = a_offset bundle = control`
	`18`	`+#pragma HLS INTERFACE s_axilite port = c_offset bundle = control`
	`19`	`+`
`16`	`20`	`#pragma HLS INTERFACE s_axilite port = return bundle = control`
`17`	`21`	`for (int i = 2; i < M * 2; i *= 2)`
`18`	`22`	`{`
`@@ -25,34 +29,34 @@ extern "C"`
`25`	`29`
`26`	`30`	`int k = left * N, l = left, r = mid + 1;`
`27`	`31`	`while (l <= mid && r <= right)`
`28`		`- if (a[l * N + K] <= a[r * N + K])`
	`32`	`+ if (a[l * N + K] > a[r * N + K])`
`29`	`33`	`{`
`30`	`34`	`for (int m = l * N; m < l * N + N; m++)`
`31`		`- c[k++] = a[m];`
	`35`	`+ c[c_offset + k++] = a[a_offset + m];`
`32`	`36`	`l++;`
`33`	`37`	`}`
`34`	`38`	`else`
`35`	`39`	`{`
`36`	`40`	`for (int m = r * N; m < r * N + N; m++)`
`37`		`- c[k++] = a[m];`
	`41`	`+ c[c_offset + k++] = a[a_offset + m];`
`38`	`42`	`r++;`
`39`	`43`	`}`
`40`	`44`
`41`	`45`	`while (l <= mid)`
`42`	`46`	`{`
`43`	`47`	`for (int m = l * N; m < l * N + N; m++)`
`44`		`- c[k++] = a[m];`
	`48`	`+ c[c_offset + k++] = a[a_offset + m];`
`45`	`49`	`l++;`
`46`	`50`	`}`
`47`	`51`	`while (r <= right)`
`48`	`52`	`{`
`49`	`53`	`for (int m = r * N; m < r * N + N; m++)`
`50`		`- c[k++] = a[m];`
	`54`	`+ c[c_offset + k++] = a[a_offset + m];`
`51`	`55`	`r++;`
`52`	`56`	`}`
`53`	`57`	`for (int m = left * N; m < (right + 1) * N; m++)`
`54`		`- a[m] = c[m];`
	`58`	`+ a[c_offset + m] = c[a_offset + m];`
`55`	`59`	`}`
`56`	`60`	`}`
`57`	`61`	`}`
`58`		`-}`
	`62`	`+}`