diff --git a/.gitignore b/.gitignore
index 2b8c6977..9dda0010 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,7 @@ sd-driver
 #Project generated test files
 python/test_scripts/testData
 python/.venv
+__pycache__
 
 #Vim stuff
 *.swp
diff --git a/TESTS/tensors/test_romtensor.cpp b/TESTS/tensors/test_romtensor.cpp
index 025ca69a..be968065 100644
--- a/TESTS/tensors/test_romtensor.cpp
+++ b/TESTS/tensors/test_romtensor.cpp
@@ -125,3 +125,35 @@ TEST(Rom_Tensor, read_write_i16) {
   cout << "uint16 Sizeof IntegralValue " << sizeof(IntegralValue(5)) << endl;
   delete[] buffer;
 }
+
+TEST(ScalarRom_Tensor, read_write_i8) {
+  ///setup_context();
+  localCircularArenaAllocator<256> meta_allocator;
+  localCircularArenaAllocator<256> ram_allocator;
+  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
+  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
+  int8_t* buffer = new int8_t[1];
+  buffer[0] = 5;
+  ScalarRomTensor r({1}, i8, buffer);
+  int8_t read = r(2, 2);
+  EXPECT_EQ(read, 5);
+  cout << "i8 Sizeof IntegralValue " << sizeof(IntegralValue(5)) << endl;
+  cout << "Sizeof RomTensor " << sizeof(r) << endl;
+  delete[] buffer;
+}
+
+TEST(ScalarRom_Tensor, read_write_flt) {
+  ///setup_context();
+  localCircularArenaAllocator<256> meta_allocator;
+  localCircularArenaAllocator<256> ram_allocator;
+  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
+  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
+  float* buffer = new float[1];
+  buffer[0] = 5.0;
+  ScalarRomTensor r({1}, flt, buffer);
+  float read = r(2, 2);
+  EXPECT_NEAR(read, 5.0, 0.0001);
+  cout << "float Sizeof IntegralValue " << sizeof(IntegralValue(5)) << endl;
+  cout << "Sizeof RomTensor " << sizeof(r) << endl;
+  delete[] buffer;
+}
diff --git a/python/test_scripts/__init__.py b/python/test_scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/test_scripts/gen_softmax.py b/python/test_scripts/gen_softmax.py
new file mode 100644
index 00000000..a39a71a6
--- /dev/null
+++ b/python/test_scripts/gen_softmax.py
@@ -0,0 +1,24 @@
+from jinja_env import env2, Operator, Tensor, SingleOpTest
+import tensorflow as tf
+
+test_group = "Softmax"
+
+def gen_test(test_number, scale=1.0):
+    test_name = "random_gen_scale_%d__%d" % (int(scale), test_number)
+    in1 = tf.constant(tf.random.uniform([1, 10]) * scale).numpy()
+    out_1 = tf.nn.softmax(in1).numpy()
+
+    in_ref_name = "s_ref_in_%d" % test_number
+    out_ref_name = "s_ref_out_%d" % test_number
+    in_t = Tensor("in", in1, ref_name=in_ref_name)
+    out_ref = Tensor("out_ref", out_1, ref_name=out_ref_name)  # Store the reference out values
+    out_t = Tensor("out", out_1)
+    #conv_param_str = "{%s}, %s" % (str(strides).lstrip('[').rstrip(']'), padding)
+    #convOp = Operator("Conv2dOperator", "op_0", dtypes=["float"], param_str=conv_param_str)
+    op = Operator("SoftmaxOperator", "softmaxOp", dtypes=["float"])
+    op.set_inputs({"in": in_t}).set_outputs({"out": out_t})
+
+    test = SingleOpTest(test_group, test_name, op)
+    test.add_tensor_comparison(out_t, out_ref)
+    test_rendered, const_snippets = test.render()
+    print(test_rendered)
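Note that `gen_test` prints the rendered TEST block but currently drops `const_snippets` on the floor, so the reference arrays still have to be written into a constants header separately. A minimal sketch of driving the generator (the `__main__` wrapper and iteration count below are illustrative assumptions, not part of this patch):

```python
# Hypothetical driver for gen_softmax.py; run from python/test_scripts/.
# Redirect stdout into a generated test .cpp file.
from gen_softmax import gen_test

if __name__ == "__main__":
    for test_number in range(5):           # number of random cases: arbitrary
        gen_test(test_number, scale=10.0)  # scale spreads the uniform inputs
```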
Path(__file__).parent / "templates_v2" env = jinja2.Environment( loader=jinja2.FileSystemLoader(_template_dir), trim_blocks=True, lstrip_blocks=True -) + ) env.globals.update( zip=zip, len=len, TENSOR_TYPE_MAP={ - "int8_t": "i8", - "uint8_t": "u8", - "int16_t": "i16", - "uint16_t": "u16", - "int32_t": "i32", - "uint32_t": "u32", - "float": "flt", - }, -) + "int8_t": "i8", + "uint8_t": "u8", + "int16_t": "i16", + "uint16_t": "u16", + "int32_t": "i32", + "uint32_t": "u32", + "float": "flt", + }, + ) del _template_dir + +env2 = jinja2.Environment( + loader=jinja2.FileSystemLoader(_template2_dir), trim_blocks=True, lstrip_blocks=True + ) +env2.globals.update( + zip=zip, + len=len, + TENSOR_TYPE_MAP={ + "int8_t": "i8", + "uint8_t": "u8", + "int16_t": "i16", + "uint16_t": "u16", + "int32_t": "i32", + "uint32_t": "u32", + "float": "flt", + }, + NUMPY_2_CMAP={ + np.int8: "int8_t", + np.uint8: "uint8_t", + np.int16: "int16_t", + np.uint16: "uint16_t", + np.int32: "int32_t", + np.uint32: "uint32_t", + np.float: "float", + np.dtype('float32'): "float", + }, + ) + +TENSOR_TYPE_MAP={ + "int8_t": "i8", + "uint8_t": "u8", + "int16_t": "i16", + "uint16_t": "u16", + "int32_t": "i32", + "uint32_t": "u32", + "float": "flt", + } +NUMPY_2_CMAP={ + np.int8: "int8_t", + np.uint8: "uint8_t", + np.int16: "int16_t", + np.uint16: "uint16_t", + np.int32: "int32_t", + np.uint32: "uint32_t", + np.float: "float", + np.dtype('float32'): "float", + } + +class Tensor: + def __init__(self, name, np_array, ref_name=None, quantize_params=[]): + self.name = name + self.np_array = np_array + self.ref_name = ref_name + self.quantize_params = quantize_params + + @property + def shape(self): + return self.np_array.shape + + @property + def dtype(self): + return NUMPY_2_CMAP[self.np_array.dtype] + + @property + def utype(self): + return TENSOR_TYPE_MAP[self.dtype] + + def flatten(self): + return self.np_array.flatten() + + def render_constant(self): + if self.ref_name: + return env2.get_template('def_constant.hpp').render(tensor=self) + else: + return "" + def render_declaration(self): + if self.ref_name: + return env2.get_template('declare_rom_tensor.cpp').render(tensor=self) + else: + return env2.get_template('declare_ram_tensor.cpp').render(tensor=self) + + +class Operator: + def __init__(self, op_type, name, dtypes=[], param_str=None): + self.op_type = op_type + self.name = name + self.dtypes = dtypes + self.param_str = param_str + self.array_template = env2.get_template('array_template.cpp') + self.input_map = {} + self.output_map = {} + self.type_signature = env2.get_template('op_type_signature.cpp').render(op=self) + + def set_inputs(self, input_map): + self.input_map = input_map + return self + + def set_outputs(self, output_map): + self.output_map = output_map + return self + + def render_declaration(self): + return env2.get_template('declare_operator.cpp').render(op=self) + + def render_eval(self): + return env2.get_template('eval_operator.cpp').render(op=self) + +class SingleOpTest: + def __init__(self, test_group, test_name, target_op): + self.test_group = test_group + self.test_name = test_name + self.out_size = 0 + for out_tensor in target_op.output_map: + self.out_size += len(target_op.output_map[out_tensor].flatten()) + self.target_op = target_op + self.compare_tensors = [] + self.tensor_set = set() + for tensor in target_op.input_map: + self.tensor_set.add(target_op.input_map[tensor]) + for tensor in target_op.output_map: + self.tensor_set.add(target_op.output_map[tensor]) + + def add_tensor_comparison(self, a, b): 
+
+class SingleOpTest:
+    def __init__(self, test_group, test_name, target_op):
+        self.test_group = test_group
+        self.test_name = test_name
+        self.out_size = 0
+        for out_tensor in target_op.output_map:
+            self.out_size += len(target_op.output_map[out_tensor].flatten())
+        self.target_op = target_op
+        self.compare_tensors = []
+        self.tensor_set = set()
+        for tensor in target_op.input_map:
+            self.tensor_set.add(target_op.input_map[tensor])
+        for tensor in target_op.output_map:
+            self.tensor_set.add(target_op.output_map[tensor])
+
+    def add_tensor_comparison(self, a, b):
+        self.compare_tensors.append((a, b))
+        self.tensor_set.add(a)
+        self.tensor_set.add(b)
+
+    def render(self):
+        const_snippets = []
+        tensor_decls = []
+        for tensor in self.tensor_set:
+            const_snippets.append(tensor.render_constant())
+            tensor_decls.append(tensor.render_declaration())
+        op_decl = self.target_op.render_declaration()
+        op_eval = self.target_op.render_eval()
+
+        compare_snippets = []
+        for a, b in self.compare_tensors:
+            compare_snippets.append(env2.get_template('compare_outputs.cpp').render(a=a, b=b))
+
+        TestTemplate = env2.get_template('test_container.cpp')
+        test_rendered = TestTemplate.render(test_group=self.test_group,
+                                            test_name=self.test_name,
+                                            out_size=self.out_size,
+                                            tensor_declarations=tensor_decls,
+                                            op_decl=op_decl,
+                                            op_eval=op_eval,
+                                            compare_snippets=compare_snippets)
+        return (test_rendered, const_snippets)
+
+
+del _template2_dir
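As a quick illustration of the type plumbing above, `Tensor.dtype` resolves the NumPy dtype to a C type via `NUMPY_2_CMAP`, and `Tensor.utype` maps that to the uTensor type enum via `TENSOR_TYPE_MAP`; a small sketch of the intended behavior:

```python
# Minimal sketch of the Tensor helper's type mapping.
import numpy as np
from jinja_env import Tensor

t = Tensor("in", np.zeros((1, 10), dtype=np.float32), ref_name="s_ref_in_0")
assert t.dtype == "float"   # NUMPY_2_CMAP[np.dtype('float32')]
assert t.utype == "flt"     # TENSOR_TYPE_MAP["float"]
assert t.shape == (1, 10)   # forwarded from the underlying ndarray
```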
diff --git a/python/test_scripts/jinja_env/templates_v2/array_template.cpp b/python/test_scripts/jinja_env/templates_v2/array_template.cpp
new file mode 100644
index 00000000..cbaec841
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/array_template.cpp
@@ -0,0 +1 @@
+{% for x in arr %}{{ x }}{{ "," if not loop.last }}{% endfor %}
diff --git a/python/test_scripts/jinja_env/templates_v2/compare_outputs.cpp b/python/test_scripts/jinja_env/templates_v2/compare_outputs.cpp
new file mode 100644
index 00000000..04f8c007
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/compare_outputs.cpp
@@ -0,0 +1,3 @@
+for(int i = 0; i < {{ len(b.flatten()) }}; i++) {
+  EXPECT_NEAR(static_cast<{{ a.dtype }}>( {{ a.name }}(i) ), static_cast<{{ b.dtype }}>( {{ b.name }}(i) ), 0.0001);
+}
diff --git a/python/test_scripts/jinja_env/templates_v2/const_container.hpp b/python/test_scripts/jinja_env/templates_v2/const_container.hpp
new file mode 100644
index 00000000..f41695a2
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/const_container.hpp
@@ -0,0 +1,6 @@
+#ifndef {{ constants_header | replace(".", "_") }}
+#define {{ constants_header | replace(".", "_") }}
+{% for constant_snippet in constants %}
+{{ constant_snippet }}
+{% endfor %}
+#endif
diff --git a/python/test_scripts/jinja_env/templates_v2/declare_operator.cpp b/python/test_scripts/jinja_env/templates_v2/declare_operator.cpp
new file mode 100644
index 00000000..035f26d0
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/declare_operator.cpp
@@ -0,0 +1 @@
+{{ op.type_signature }} {{ op.name }}{% if op.param_str %}({{ op.param_str }}){% endif %};
diff --git a/python/test_scripts/jinja_env/templates_v2/declare_ram_tensor.cpp b/python/test_scripts/jinja_env/templates_v2/declare_ram_tensor.cpp
new file mode 100644
index 00000000..6b674d57
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/declare_ram_tensor.cpp
@@ -0,0 +1,4 @@
+Tensor {{ tensor.name }} = new RamTensor({ {% for s in tensor.shape %}{{ s }}{{ "," if not loop.last }}{% endfor %} }, {{ tensor.utype }});
+{% if tensor.quantize_params %}
+  {{ tensor.name }}->set_quantization_params(PerTensorQuantizationParams({{ tensor.quantize_params[1] }}, {{ tensor.quantize_params[0] }}));
+{% endif %}
diff --git a/python/test_scripts/jinja_env/templates_v2/declare_rom_tensor.cpp b/python/test_scripts/jinja_env/templates_v2/declare_rom_tensor.cpp
new file mode 100644
index 00000000..447775c1
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/declare_rom_tensor.cpp
@@ -0,0 +1,4 @@
+Tensor {{ tensor.name }} = new RomTensor({ {% for s in tensor.shape %}{{ s }}{{ "," if not loop.last }}{% endfor %} }, {{ tensor.utype }}, {{ tensor.ref_name }});
+{% if tensor.quantize_params %}
+  {{ tensor.name }}->set_quantization_params(PerTensorQuantizationParams({{ tensor.quantize_params[1] }}, {{ tensor.quantize_params[0] }}));
+{% endif %}
diff --git a/python/test_scripts/jinja_env/templates_v2/def_constant.hpp b/python/test_scripts/jinja_env/templates_v2/def_constant.hpp
new file mode 100644
index 00000000..f213e528
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/def_constant.hpp
@@ -0,0 +1,3 @@
+static const {{ tensor.dtype }} {{ tensor.ref_name }}[{{ len(tensor.flatten()) }}] = {
+  {% for x in tensor.flatten() %} {{ x }}{{ "," if not loop.last }}{{ "\n" if not loop.first and loop.index % 10 == 0 }} {% endfor %}
+};
diff --git a/python/test_scripts/jinja_env/templates_v2/eval_operator.cpp b/python/test_scripts/jinja_env/templates_v2/eval_operator.cpp
new file mode 100644
index 00000000..0ec0c0d4
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/eval_operator.cpp
@@ -0,0 +1,10 @@
+{{ op.name }}
+  .set_inputs({
+    {% for x in op.input_map %}
+    { {{ op.type_signature }}::{{ x }}, {{ op.input_map[x].name }} }{{ "," if not loop.last }}
+    {% endfor %}
+  }).set_outputs({
+    {% for x in op.output_map %}
+    { {{ op.type_signature }}::{{ x }}, {{ op.output_map[x].name }} }{{ "," if not loop.last }}
+    {% endfor %}
+  }).eval();
diff --git a/python/test_scripts/jinja_env/templates_v2/op_type_signature.cpp b/python/test_scripts/jinja_env/templates_v2/op_type_signature.cpp
new file mode 100644
index 00000000..6b658313
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/op_type_signature.cpp
@@ -0,0 +1 @@
+{{ op.op_type }}{% if op.dtypes %}<{{ op.array_template.render(arr=op.dtypes) }}>{% endif %}
diff --git a/python/test_scripts/jinja_env/templates_v2/test_container.cpp b/python/test_scripts/jinja_env/templates_v2/test_container.cpp
new file mode 100644
index 00000000..2c2716f6
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/test_container.cpp
@@ -0,0 +1,20 @@
+/***************************************
+ * Generated Test
+ ***************************************/
+
+TEST({{ test_group }}, {{ test_name }}) {
+  localCircularArenaAllocator<1024> meta_allocator;
+  localCircularArenaAllocator<{{ out_size }}*2*sizeof(float), uint32_t> ram_allocator;
+  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
+  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
+
+  {% for tensor_decl in tensor_declarations %}
+  {{ tensor_decl }}{% endfor %}
+
+  {{ op_decl }}
+  {{ op_eval }}
+
+  {% for compare_snippet in compare_snippets %}
+  {{ compare_snippet }}
+  {% endfor %}
+}
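Composed, these templates emit self-contained googletest cases. A hand-rendered approximation of one Softmax case from `gen_softmax.py` follows; treat it as a sketch, since tensor declaration order depends on `tensor_set` iteration and exact whitespace on jinja's trim settings:

```cpp
/***************************************
 * Generated Test
 ***************************************/

TEST(Softmax, random_gen_scale_10__0) {
  localCircularArenaAllocator<1024> meta_allocator;
  localCircularArenaAllocator<10*2*sizeof(float), uint32_t> ram_allocator;
  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);

  // s_ref_in_0 / s_ref_out_0 come from the def_constant.hpp snippets,
  // collected into a constants header via const_container.hpp.
  Tensor in = new RomTensor({ 1,10 }, flt, s_ref_in_0);
  Tensor out_ref = new RomTensor({ 1,10 }, flt, s_ref_out_0);
  Tensor out = new RamTensor({ 1,10 }, flt);

  SoftmaxOperator<float> softmaxOp;
  softmaxOp
    .set_inputs({
      { SoftmaxOperator<float>::in, in }
    }).set_outputs({
      { SoftmaxOperator<float>::out, out }
    }).eval();

  for(int i = 0; i < 10; i++) {
    EXPECT_NEAR(static_cast<float>( out(i) ), static_cast<float>( out_ref(i) ), 0.0001);
  }
}
```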
diff --git a/src/uTensor/CMakeLists.txt b/src/uTensor/CMakeLists.txt
index 527c9b4b..8e38e115 100644
--- a/src/uTensor/CMakeLists.txt
+++ b/src/uTensor/CMakeLists.txt
@@ -14,6 +14,8 @@ set(src_utensor_tensors
 )
 set(src_utensor_ops
   ops/Matrix.cpp
+  ops/Matrix_kernels.cpp
+  ops/Convolution_kernels.cpp
 )
 set(src_utensor_errhndl
   errorHandlers/SimpleErrorHandler.cpp
diff --git a/src/uTensor/core/quantizationPrimitives.cpp b/src/uTensor/core/quantizationPrimitives.cpp
index e3c21631..aa70d912 100644
--- a/src/uTensor/core/quantizationPrimitives.cpp
+++ b/src/uTensor/core/quantizationPrimitives.cpp
@@ -83,11 +83,15 @@ void QuantizationParamsHandle::free() {
 QuantizationParamsHandle::QuantizationParamsHandle() : Handle() {}
 QuantizationParamsHandle::QuantizationParamsHandle(QuantizationParams* ptr)
     : Handle((void*)ptr) {
   // Context::get_default_context()->get_metadata_allocator()->bind(_ptr, this);
-  bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  if (ptr) {
+    bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  }
 }
 QuantizationParamsHandle& QuantizationParamsHandle::operator=(
     QuantizationParams* ptr) {
   _ptr = (void*)ptr;
-  bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  if (ptr) {
+    bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  }
   // Context::get_metadata_allocator()->bind(_ptr, this);
   return *this;
 }
diff --git a/src/uTensor/ops/ActivationFncs.hpp b/src/uTensor/ops/ActivationFncs.hpp
index 9b2dc390..5ca82698 100644
--- a/src/uTensor/ops/ActivationFncs.hpp
+++ b/src/uTensor/ops/ActivationFncs.hpp
@@ -83,7 +83,87 @@ class ReLU6Operator : public OperatorInterface<1, 1> {
   }
 };
 
-}
+template <typename T>
+class InPlaceSoftmax : public InPlaceActivationFnc {
+  // Softmax only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct Softmax on non-signed types");
+
+ public:
+  InPlaceSoftmax() : beta(1) {}
+  InPlaceSoftmax(T beta) : beta(beta) {}
+
+ protected:
+  virtual void compute() { inplace_softmax_k<T>(inputs[x].tensor(), beta); }
+
+ private:
+  T beta;
+};
+
+template <typename T>
+class SoftmaxOperator : public OperatorInterface<1, 1> {
+  // Softmax only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct softmax on non-signed types");
+
+ public:
+  enum names_in : uint8_t { in };
+  enum names_out : uint8_t { out };
+
+ public:
+  SoftmaxOperator() : beta(1) {}
+  SoftmaxOperator(T beta) : beta(beta) {}
+
+ protected:
+  virtual void compute() {
+    const Tensor& inT = inputs[in].tensor();
+    Tensor& outT = outputs[out].tensor();
+    // TODO Check sizes here and throw mismatch
+    uint32_t in_size = inT->get_shape().get_linear_size();
+    uint32_t out_size = outT->get_shape().get_linear_size();
+    if (in_size != out_size)
+      Context::get_default_context()->throwError(
+          new OperatorIOSizeMismatchError);
+    softmax_k<T>(outT, inT, beta);
+  }
+
+ private:
+  T beta;
+};
+
+template <typename T>
+class InPlaceSigmoid : public InPlaceActivationFnc {
+  // Sigmoid only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct Sigmoid on non-signed types");
+
+ protected:
+  virtual void compute() { inplace_sigmoid_k<T>(inputs[x].tensor()); }
+};
+
+template <typename T>
+class SigmoidOperator : public OperatorInterface<1, 1> {
+  // Sigmoid only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct Sigmoid on non-signed types");
+
+ public:
+  enum names_in : uint8_t { in };
+  enum names_out : uint8_t { out };
+
+ protected:
+  virtual void compute() {
+    const Tensor& inT = inputs[in].tensor();
+    Tensor& outT = outputs[out].tensor();
+    // TODO Check sizes here and throw mismatch
+    uint32_t in_size = inT->get_shape().get_linear_size();
+    uint32_t out_size = outT->get_shape().get_linear_size();
+    if (in_size != out_size)
+      Context::get_default_context()->throwError(
+          new OperatorIOSizeMismatchError);
+    sigmoid_k<T>(outT, inT);
+  }
+};
+
+}  // namespace ReferenceOperators
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/ActivationFncs_kernels.hpp b/src/uTensor/ops/ActivationFncs_kernels.hpp
index 9300c134..62777ede 100644
--- a/src/uTensor/ops/ActivationFncs_kernels.hpp
+++ b/src/uTensor/ops/ActivationFncs_kernels.hpp
@@ -1,9 +1,43 @@
 #ifndef UTENSOR_ACTIVATIONS_KERNELS_H
 #define UTENSOR_ACTIVATIONS_KERNELS_H
 #include "operatorBase.hpp"
+#include <cmath>
+#include <functional>
+#include <limits>
+
+using std::exp;
 
 namespace uTensor {
+namespace Fuseable {
+
+template <typename T>
+using Activation = std::function<T(T)>;
+
+template <typename T>
+T NoActivation(T x) {
+  return x;
+}
+
+template <typename T>
+T ReLU(T x) {
+  return (x < 0) ? 0 : x;
+}
+
+template <typename T>
+T ReLU6(T x) {
+  if (x < 0) {
+    return 0;
+  } else if (x > 6) {
+    return 6;
+  } else {
+    return x;
+  }
+}
+
+template <typename T>
+T Sigmoid(T x) {
+  const T one = 1;
+  return one / (one + exp(-x));
+}
+
+}  // namespace Fuseable
+
 template <typename T>
 void inplace_relu_k(Tensor& t) {
   T tmp;
@@ -60,5 +94,86 @@ void relu6_k(Tensor& out, const Tensor& in) {
   }
 }
 
+template <typename T>
+void inplace_softmax_k(Tensor& in, T beta = 1) {
+  const TensorShape& inShape = in->get_shape();
+  int outer_dim = inShape.num_dims() - 1;
+  int depth = inShape[outer_dim];
+  int out_side_numelems = 1;
+  for (int i = 0; i < inShape.num_dims(); i++) {
+    out_side_numelems *= (i == outer_dim) ? 1 : inShape[i];
+  }
+
+  for (int i = 0; i < out_side_numelems; i++) {
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    T max = std::numeric_limits<T>::lowest();
+    for (int j = 0; j < depth; j++) {
+      max = std::max(max, static_cast<T>(in(i, j)));
+    }
+
+    T mSum = 0;
+    for (int j = 0; j < depth; j++) {
+      T tmp = exp((static_cast<T>(in(i, j)) - max) * beta);
+      mSum += tmp;
+      in(i, j) = tmp;
+    }
+    for (int j = 0; j < depth; j++) {
+      in(i, j) = static_cast<T>(in(i, j)) / mSum;
+    }
+  }
+}
+
+template <typename T>
+void softmax_k(Tensor& out, const Tensor& in, T beta = 1) {
+  const TensorShape& inShape = in->get_shape();
+  int outer_dim = inShape.num_dims() - 1;
+  int depth = inShape[outer_dim];
+  int out_side_numelems = 1;
+  for (int i = 0; i < inShape.num_dims(); i++) {
+    out_side_numelems *= (i == outer_dim) ? 1 : inShape[i];
+  }
+
+  for (int i = 0; i < out_side_numelems; i++) {
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    T max = std::numeric_limits<T>::lowest();
+    for (int j = 0; j < depth; j++) {
+      max = std::max(max, static_cast<T>(in(i, j)));
+    }
+
+    T mSum = 0;
+    for (int j = 0; j < depth; j++) {
+      T tmp = exp((static_cast<T>(in(i, j)) - max) * beta);
+      mSum += tmp;
+      out(i, j) = tmp;
+    }
+    for (int j = 0; j < depth; j++) {
+      out(i, j) = static_cast<T>(out(i, j)) / mSum;
+    }
+  }
+}
+
+template <typename T>
+void inplace_sigmoid_k(Tensor& t) {
+  const T one = 1;
+  uint32_t t_size = t->get_shape().get_linear_size();
+  for (uint32_t i = 0; i < t_size; i++) {
+    const T tmp = one / (one + exp(-static_cast<T>(t(i))));
+    t(i) = tmp;
+  }
+}
+
+template <typename T>
+void sigmoid_k(Tensor& out, const Tensor& in) {
+  const T one = 1;
+  uint32_t t_size = in->get_shape().get_linear_size();
+  for (uint32_t i = 0; i < t_size; i++) {
+    const T tmp = one / (one + exp(-static_cast<T>(in(i))));
+    out(i) = tmp;
+  }
+}
+
 }  // namespace uTensor
 #endif
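Both softmax kernels subtract the per-row maximum before exponentiating. This is the standard overflow guard and relies on the shift invariance the inline comment alludes to, with the `beta` temperature factored in:

$$\mathrm{softmax}_\beta(x)_i = \frac{e^{\beta(x_i - m)}}{\sum_j e^{\beta(x_j - m)}} = \frac{e^{\beta x_i}}{\sum_j e^{\beta x_j}}, \qquad m = \max_j x_j,$$

so for positive `beta` the largest argument ever passed to `exp` is 0, and large inputs cannot overflow.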
diff --git a/src/uTensor/ops/Arithmetic.hpp b/src/uTensor/ops/Arithmetic.hpp
index 2be81534..2bfe1e38 100644
--- a/src/uTensor/ops/Arithmetic.hpp
+++ b/src/uTensor/ops/Arithmetic.hpp
@@ -23,6 +23,34 @@ class AddOperator : public OperatorInterface<2, 1> {
   }
 };
 
+template <typename T>
+class SubOperator : public OperatorInterface<2, 1> {
+ public:
+  enum names_in : uint8_t { a, b };
+  enum names_out : uint8_t { c };
+  // SubOperator(FixedTensorMap<2> inputs, FixedTensorMap<1> outputs) :
+  //     OperatorBase(inputs, outputs) {}
+
+ protected:
+  virtual void compute() {
+    sub_kernel<T>(outputs[c].tensor(), inputs[a].tensor(), inputs[b].tensor());
+  }
+};
+
+template <typename T>
+class MulOperator : public OperatorInterface<2, 1> {
+ public:
+  enum names_in : uint8_t { a, b };
+  enum names_out : uint8_t { c };
+  // MulOperator(FixedTensorMap<2> inputs, FixedTensorMap<1> outputs) :
+  //     OperatorBase(inputs, outputs) {}
+
+ protected:
+  virtual void compute() {
+    mul_kernel<T>(outputs[c].tensor(), inputs[a].tensor(), inputs[b].tensor());
+  }
+};
+
 }
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/Arithmetic_kernels.hpp b/src/uTensor/ops/Arithmetic_kernels.hpp
index 4937f45e..1250d2ab 100644
--- a/src/uTensor/ops/Arithmetic_kernels.hpp
+++ b/src/uTensor/ops/Arithmetic_kernels.hpp
@@ -6,7 +6,7 @@ namespace uTensor {
 template <typename T>
 void add_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
   // Decide on c shape
-  TensorShape c_shape = c->get_shape();
+  const TensorShape& c_shape = c->get_shape();
   uint32_t c_size = c_shape.get_linear_size();
   // TensorInterface& C = reinterpret_cast<TensorInterface&>(*c);
   // const TensorInterface& A = reinterpret_cast<const TensorInterface&>(*a);
@@ -16,5 +16,31 @@ void add_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
     c(i) = static_cast<T>(static_cast<T>(a(i)) + static_cast<T>(b(i)));
 }
 
+template <typename T>
+void sub_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
+  // Decide on c shape
+  const TensorShape& c_shape = c->get_shape();
+  uint32_t c_size = c_shape.get_linear_size();
+
+  for (uint32_t i = 0; i < c_size; i++)
+    c(i) = static_cast<T>(static_cast<T>(a(i)) - static_cast<T>(b(i)));
+}
+
+template <typename T>
+void mul_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
+  // Decide on c shape
+  const TensorShape& c_shape = c->get_shape();
+  uint32_t c_size = c_shape.get_linear_size();
+
+  for (uint32_t i = 0; i < c_size; i++)
+    c(i) = static_cast<T>(static_cast<T>(a(i)) * static_cast<T>(b(i)));
+}
+
 }  // namespace uTensor
 #endif
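Worth noting: `add_kernel`, `sub_kernel`, and `mul_kernel` all walk the output's linear size and index `a` and `b` with the same flat index, so all three tensors must agree in shape; there is no broadcasting. A small usage sketch (values illustrative; assumes the usual test scaffolding and a using-declaration for `uTensor::ReferenceOperators`):

```cpp
// Elementwise multiply sketch; all three shapes must match exactly.
static const float a_data[3] = {1.0f, 2.0f, 3.0f};
static const float b_data[3] = {4.0f, 5.0f, 6.0f};

Tensor a = new RomTensor({3}, flt, a_data);
Tensor b = new RomTensor({3}, flt, b_data);
Tensor c = new RamTensor({3}, flt);

MulOperator<float> mulOp;
mulOp
    .set_inputs({{MulOperator<float>::a, a}, {MulOperator<float>::b, b}})
    .set_outputs({{MulOperator<float>::c, c}})
    .eval();  // c now holds {4, 10, 18}
```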
diff --git a/src/uTensor/ops/Convolution.hpp b/src/uTensor/ops/Convolution.hpp
index faf9e5ba..462908ca 100644
--- a/src/uTensor/ops/Convolution.hpp
+++ b/src/uTensor/ops/Convolution.hpp
@@ -225,6 +225,80 @@ using MaxPoolOperator = GenericPoolOperator<T, MaxFilter<T>>;
 template <typename T>
 using AvgPoolOperator = GenericPoolOperator<T, AvgFilter<T>>;
 
+template <typename Tout>
+class DepthwiseSeparableConvOperatorV2 : public OperatorInterface<3, 1> {
+ public:
+  enum names_in : uint8_t { in, filter, bias };
+  enum names_out : uint8_t { out };
+
+ public:
+  DepthwiseSeparableConvOperatorV2();
+  // TODO allow 4D bits later
+  // DepthwiseSeparableConvOperatorV2(
+  //     const uint16_t (&strides)[4], Padding padding,
+  //     const int depth_multiplier = 1, const uint16_t (&dilation)[2] = {1, 1});
+  DepthwiseSeparableConvOperatorV2(
+      const uint16_t (&strides)[2], Padding padding,
+      const int depth_multiplier = 1, const uint16_t (&dilation)[2] = {1, 1});
+
+ protected:
+  virtual void compute();
+
+ private:
+  // TfLiteDepthwiseConvParams
+  // Set by constructors
+  uint16_t _stride[4];
+  Padding _padding;
+  int depth_multiplier;
+  uint16_t _dilation[2];
+};
+
+template <typename Tout>
+DepthwiseSeparableConvOperatorV2<Tout>::DepthwiseSeparableConvOperatorV2()
+    : _stride{1, 1, 1, 1},
+      _padding(SAME),
+      depth_multiplier(1),
+      _dilation{1, 1} {}
+
+template <typename Tout>
+DepthwiseSeparableConvOperatorV2<Tout>::DepthwiseSeparableConvOperatorV2(
+    const uint16_t (&strides)[2], Padding padding, const int depth_multiplier,
+    const uint16_t (&dilation)[2])
+    : _stride{1, strides[0], strides[1], 1},
+      _padding(padding),
+      depth_multiplier(depth_multiplier),
+      _dilation{dilation[0], dilation[1]} {}
+
+template <typename Tout>
+void DepthwiseSeparableConvOperatorV2<Tout>::compute() {
+  const TensorShape& in_shape = inputs[in].tensor()->get_shape();
+  const TensorShape& df_shape = inputs[filter].tensor()->get_shape();
+  const TensorShape& bias_shape = inputs[bias].tensor()->get_shape();
+  const TensorShape& out_shape = outputs[out].tensor()->get_shape();
+
+  if (in_shape[3] != df_shape[2]) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (bias_shape[0] != 1 || bias_shape[1] != 1) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+
+  depthwise_separable_convolution_kernel_v2<Tout>(
+      outputs[out].tensor(), inputs[in].tensor(), inputs[filter].tensor(),
+      inputs[bias].tensor(), _padding, _stride, depth_multiplier, _dilation);
+}
+
 }
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/Convolution_kernels.cpp b/src/uTensor/ops/Convolution_kernels.cpp
new file mode 100644
index 00000000..557e7f81
--- /dev/null
+++ b/src/uTensor/ops/Convolution_kernels.cpp
@@ -0,0 +1,52 @@
+#include "Convolution_kernels.hpp"
+
+namespace uTensor {
+// It's not guaranteed that padding is symmetric. It's important to keep
+// the offset for algorithms that need all paddings.
+int ComputePaddingWithOffset(int stride, int dilation_rate, int in_size,
+                             int filter_size, int out_size, int* offset) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  int total_padding =
+      ((out_size - 1) * stride + effective_filter_size - in_size);
+  total_padding = total_padding > 0 ? total_padding : 0;
+  *offset = total_padding % 2;
+  return total_padding / 2;
+}
+
+// Matching GetWindowedOutputSize in TensorFlow.
+int ComputeOutSize(Padding padding, int image_size, int filter_size,
+                   int stride, int dilation_rate = 1) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  switch (padding) {
+    case SAME:
+      return (image_size + stride - 1) / stride;
+    case VALID:
+      return (image_size + stride - effective_filter_size) / stride;
+    default:
+      return 0;
+  }
+}
+
+void uComputePaddingHeightWidth(int stride_height, int stride_width,
+                                int dilation_rate_height,
+                                int dilation_rate_width, int in_height,
+                                int in_width, int filter_height,
+                                int filter_width, int* padding_height,
+                                int* padding_width, Padding padding,
+                                int* out_height, int* out_width) {
+  *out_width = ComputeOutSize(padding, in_width, filter_width, stride_width,
+                              dilation_rate_width);
+  *out_height = ComputeOutSize(padding, in_height, filter_height,
+                               stride_height, dilation_rate_height);
+
+  int offset = 0;
+  *padding_height =
+      ComputePaddingWithOffset(stride_height, dilation_rate_height, in_height,
+                               filter_height, *out_height, &offset);
+  *padding_width =
+      ComputePaddingWithOffset(stride_width, dilation_rate_width, in_width,
+                               filter_width, *out_width, &offset);
+}
+
+}  // namespace uTensor
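As a worked example of the padding math above, take `SAME` padding with `image_size = 10`, `filter_size = 3`, `stride = 2`, `dilation_rate = 1`: `ComputeOutSize` gives `(10 + 2 - 1) / 2 = 5`; `ComputePaddingWithOffset` then computes `total_padding = (5 - 1) * 2 + 3 - 10 = 1`, returns `1 / 2 = 0`, and reports the odd leftover pixel through `*offset = 1`, which is exactly the asymmetric-padding case the comment warns about.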
diff --git a/src/uTensor/ops/Convolution_kernels.hpp b/src/uTensor/ops/Convolution_kernels.hpp
index 7000df33..03b2b997 100644
--- a/src/uTensor/ops/Convolution_kernels.hpp
+++ b/src/uTensor/ops/Convolution_kernels.hpp
@@ -426,5 +426,118 @@ void depthwise_separable_convolution_kernel(Tensor& out, const Tensor& in,
   }
 }
 
+void uComputePaddingHeightWidth(int stride_height, int stride_width,
+                                int dilation_rate_height,
+                                int dilation_rate_width, int in_height,
+                                int in_width, int filter_height,
+                                int filter_width, int* padding_height,
+                                int* padding_width, Padding padding,
+                                int* out_height, int* out_width);
+
+template <typename T>
+void depthwise_separable_convolution_kernel_v2(
+    Tensor& output, const Tensor& input, const Tensor& filter,
+    const Tensor& bias, const Padding padding, const uint16_t (&strides)[4],
+    const int depth_multiplier, const uint16_t (&dilation)[2]) {
+  // Check dimensions of the tensors.
+  const TensorShape& input_shape = input->get_shape();
+  const TensorShape& filter_shape = filter->get_shape();
+  const TensorShape& output_shape = output->get_shape();
+
+  const int batches = input_shape[0];
+  const int output_depth = output_shape[3];  // Same as filter_shape[3]
+  const int output_height = output_shape[1];
+  const int output_width = output_shape[2];
+  const int input_width = input_shape[2];
+  const int input_height = input_shape[1];
+  const int input_depth = input_shape[3];
+  const int filter_width = filter_shape[2];
+  const int filter_height = filter_shape[1];
+  const int stride_width = strides[2];
+  const int stride_height = strides[1];
+  const int dilation_width_factor = dilation[1];
+  const int dilation_height_factor = dilation[0];
+
+  int unused_output_height, unused_output_width;
+  int pad_width, pad_height;
+
+  uComputePaddingHeightWidth(stride_height, stride_width, 1, 1, input_height,
+                             input_width, filter_height, filter_width,
+                             &pad_height, &pad_width, padding,
+                             &unused_output_height, &unused_output_width);
+
+  if (!(input_shape.num_dims() == 4)) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(filter_shape.num_dims() == 4)) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(output_shape.num_dims() == 4)) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(output_depth == filter_shape[3])) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(batches == output_shape[0])) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            T acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x =
+                    in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  T input_val =
+                      static_cast<T>(input(batch, in_y, in_x, in_channel));
+                  T filter_val = static_cast<T>(
+                      filter(0, filter_y, filter_x, output_channel));
+                  acc += filter_val * input_val;
+                }
+              }
+            }
+            // assuming bias data will always be provided
+            acc += static_cast<T>(bias(output_channel));
+
+            output(batch, out_y, out_x, output_channel) = static_cast<T>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/Matrix.cpp b/src/uTensor/ops/Matrix.cpp
index 21188df4..f0c6c714 100644
--- a/src/uTensor/ops/Matrix.cpp
+++ b/src/uTensor/ops/Matrix.cpp
@@ -2,5 +2,4 @@
 
 namespace uTensor {
 
-DEFINE_ERROR(InvalidMatrixMultIndicesError);
 }
uTensor_printf("[Error] Invalid matrix multiple shape mismatch\n"); + Context::get_default_context()->throwError( + new InvalidMatrixMultIndicesError); + } + if (have_bias) { + matrix_mult_kernel_v2( + outputs[output].tensor(), inputs[input].tensor(), + inputs[filter].tensor(), inputs[bias].tensor(), _activation); + } else { + matrix_mult_kernel_v2( + outputs[output].tensor(), inputs[input].tensor(), + inputs[filter].tensor(), _activation); + } + } +}; + +template +using FullyConnectedOperator = MatrixMultOperatorV2; + } } // namespace uTensor #endif diff --git a/src/uTensor/ops/Matrix_kernels.cpp b/src/uTensor/ops/Matrix_kernels.cpp new file mode 100644 index 00000000..a8bdc803 --- /dev/null +++ b/src/uTensor/ops/Matrix_kernels.cpp @@ -0,0 +1,6 @@ +#include "Matrix_kernels.hpp" + +namespace uTensor { + +DEFINE_ERROR(InvalidMatrixMultIndicesError); +} diff --git a/src/uTensor/ops/Matrix_kernels.hpp b/src/uTensor/ops/Matrix_kernels.hpp new file mode 100644 index 00000000..96ff2f4f --- /dev/null +++ b/src/uTensor/ops/Matrix_kernels.hpp @@ -0,0 +1,104 @@ +#ifndef UTENSOR_MATRIX_KERNELS +#define UTENSOR_MATRIX_KERNELS +#include "context.hpp" +#include "operatorBase.hpp" +#include "ActivationFncs.hpp" + +namespace uTensor { +DECLARE_ERROR(InvalidMatrixMultIndicesError); + +// Assume c is already allocated to the correct size +// Naive implementation +template +void matrix_mult_kernel(Tensor& c, const Tensor& a, const Tensor& b) { + // Decide on c shape + TensorShape a_shape = a->get_shape(); + TensorShape b_shape = b->get_shape(); + TensorShape c_shape = c->get_shape(); + if (a_shape.num_dims() > 2 || b_shape.num_dims() > 2 || + c_shape.num_dims() > 2 || a_shape[1] != b_shape[0] || + a_shape[0] != c_shape[0] || b_shape[1] != c_shape[1]) { + uTensor_printf("[Error] Invalid matrix multiple shape mismatch\n"); + Context::get_default_context()->throwError( + new InvalidMatrixMultIndicesError); + } + + for (uint32_t i = 0; i < a_shape[0]; i++) { + for (uint32_t j = 0; j < b_shape[1]; j++) { + // c(i, j) = static_cast(0); + T tmp = 0; + for (uint32_t k = 0; k < a_shape[1]; k++) { + tmp += static_cast(a(i, k)) * static_cast(b(k, j)); + // printf("i, j, k : %d %d %d %d %d\n", i, j, k, static_cast(a(i, k)) + // , static_cast(b(k, j))); + } + c(i, j) = tmp; + } + } +} + +template +void matrix_mult_kernel_v2(Tensor& output, const Tensor& input, + const Tensor& filter, + Fuseable::Activation activation){ + const TensorShape& input_shape = input->get_shape(); + const TensorShape& filter_shape = filter->get_shape(); + TensorShape& output_shape = output->get_shape(); + + const int filter_dim_count = filter_shape.num_dims(); + const int batches = output_shape[0]; + const int output_depth = output_shape[1]; + if (!(output_depth <= filter_shape[filter_dim_count - 1])) { + Context::get_default_context()->throwError( + new InvalidMatrixMultIndicesError); + } + const int accum_depth = filter_shape[0]; + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + T acc = 0; + for (int d = 0; d < accum_depth; ++d) { + // TODO write this in tensor form + T input_val = static_cast(input(b, d, 0, 0)); + T filter_val = static_cast(filter(d, out_c, 0, 0)); + acc += filter_val * input_val; + } + acc = activation(acc); + output(b, out_c, 0, 0) = static_cast(acc); + } + } +} + +template +void matrix_mult_kernel_v2(Tensor& output, const Tensor& input, + const Tensor& filter, const Tensor& bias, + Fuseable::Activation activation){ + const TensorShape& input_shape = input->get_shape(); + const 
diff --git a/src/uTensor/ops/Matrix_kernels.cpp b/src/uTensor/ops/Matrix_kernels.cpp
new file mode 100644
index 00000000..a8bdc803
--- /dev/null
+++ b/src/uTensor/ops/Matrix_kernels.cpp
@@ -0,0 +1,6 @@
+#include "Matrix_kernels.hpp"
+
+namespace uTensor {
+
+DEFINE_ERROR(InvalidMatrixMultIndicesError);
+}
diff --git a/src/uTensor/ops/Matrix_kernels.hpp b/src/uTensor/ops/Matrix_kernels.hpp
new file mode 100644
index 00000000..96ff2f4f
--- /dev/null
+++ b/src/uTensor/ops/Matrix_kernels.hpp
@@ -0,0 +1,104 @@
+#ifndef UTENSOR_MATRIX_KERNELS
+#define UTENSOR_MATRIX_KERNELS
+#include "context.hpp"
+#include "operatorBase.hpp"
+#include "ActivationFncs.hpp"
+
+namespace uTensor {
+DECLARE_ERROR(InvalidMatrixMultIndicesError);
+
+// Assume c is already allocated to the correct size
+// Naive implementation
+template <typename T>
+void matrix_mult_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
+  // Decide on c shape
+  TensorShape a_shape = a->get_shape();
+  TensorShape b_shape = b->get_shape();
+  TensorShape c_shape = c->get_shape();
+  if (a_shape.num_dims() > 2 || b_shape.num_dims() > 2 ||
+      c_shape.num_dims() > 2 || a_shape[1] != b_shape[0] ||
+      a_shape[0] != c_shape[0] || b_shape[1] != c_shape[1]) {
+    uTensor_printf("[Error] Invalid matrix multiply shape mismatch\n");
+    Context::get_default_context()->throwError(
+        new InvalidMatrixMultIndicesError);
+  }
+
+  for (uint32_t i = 0; i < a_shape[0]; i++) {
+    for (uint32_t j = 0; j < b_shape[1]; j++) {
+      T tmp = 0;
+      for (uint32_t k = 0; k < a_shape[1]; k++) {
+        tmp += static_cast<T>(a(i, k)) * static_cast<T>(b(k, j));
+      }
+      c(i, j) = tmp;
+    }
+  }
+}
+
+template <typename T>
+void matrix_mult_kernel_v2(Tensor& output, const Tensor& input,
+                           const Tensor& filter,
+                           Fuseable::Activation<T> activation) {
+  const TensorShape& input_shape = input->get_shape();
+  const TensorShape& filter_shape = filter->get_shape();
+  TensorShape& output_shape = output->get_shape();
+
+  const int filter_dim_count = filter_shape.num_dims();
+  const int batches = output_shape[0];
+  const int output_depth = output_shape[1];
+  if (!(output_depth <= filter_shape[filter_dim_count - 1])) {
+    Context::get_default_context()->throwError(
+        new InvalidMatrixMultIndicesError);
+  }
+  const int accum_depth = filter_shape[0];
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      T acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        // TODO write this in tensor form
+        T input_val = static_cast<T>(input(b, d, 0, 0));
+        T filter_val = static_cast<T>(filter(d, out_c, 0, 0));
+        acc += filter_val * input_val;
+      }
+      acc = activation(acc);
+      output(b, out_c, 0, 0) = static_cast<T>(acc);
+    }
+  }
+}
+
+template <typename T>
+void matrix_mult_kernel_v2(Tensor& output, const Tensor& input,
+                           const Tensor& filter, const Tensor& bias,
+                           Fuseable::Activation<T> activation) {
+  const TensorShape& input_shape = input->get_shape();
+  const TensorShape& filter_shape = filter->get_shape();
+  TensorShape& output_shape = output->get_shape();
+
+  const int filter_dim_count = filter_shape.num_dims();
+  const int batches = output_shape[0];
+  const int output_depth = output_shape[1];
+  if (!(output_depth <= filter_shape[filter_dim_count - 1])) {
+    Context::get_default_context()->throwError(
+        new InvalidMatrixMultIndicesError);
+  }
+  const int accum_depth = filter_shape[0];
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      T acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        // TODO write this in tensor form
+        T input_val = static_cast<T>(input(b, d, 0, 0));
+        T filter_val = static_cast<T>(filter(d, out_c, 0, 0));
+        acc += filter_val * input_val;
+      }
+      acc += static_cast<T>(bias(out_c));
+      acc = activation(acc);
+      output(b, out_c, 0, 0) = static_cast<T>(acc);
+    }
+  }
+}
+
+}  // namespace uTensor
+#endif
diff --git a/src/uTensor/tensors/RomTensor.cpp b/src/uTensor/tensors/RomTensor.cpp
index 2b1872ed..f2ef03eb 100644
--- a/src/uTensor/tensors/RomTensor.cpp
+++ b/src/uTensor/tensors/RomTensor.cpp
@@ -3,6 +3,7 @@
 #include <cstdio>
 
 #include "context.hpp"
+#include "uTensor_util.hpp"
 
 namespace uTensor {
 // EVENTS
@@ -12,7 +13,7 @@ RomTensor::RomTensor(TensorShape _shape, ttype _type, const void* buffer)
 
 // TODO Need to fix the write/read selection functions in Handle
 void* RomTensor::write(uint32_t linear_index) {
-  // printf("[ERROR] Attempted write to ROM tensor, make sure it's declared
+  // uTensor_printf("[ERROR] Attempted write to ROM tensor, make sure it's declared
   // const\n"); return nullptr;
   return BufferTensor::write(linear_index);
 }
@@ -29,17 +30,43 @@ size_t RomTensor::_get_writeable_block(void*& buffer, uint16_t req_write_size,
                                        uint32_t linear_index) {
   Context::get_default_context()->throwError(
       new InvalidOptimizableTensorError());
-  printf(
+  uTensor_printf(
       "ERROR, Optimized op attempted to write access non-optimizable tensor\n");
   return -1;
 }
 
 RomTensor::~RomTensor() {}
 void RomTensor::resize(TensorShape new_shape) {
-  printf("[ERROR] Attempted resize of ROM tensor\n");
+  uTensor_printf("[ERROR] Attempted resize of ROM tensor\n");
   Context::get_default_context()->throwError(new InvalidResizeError());
 }
 
+ScalarRomTensor::ScalarRomTensor(TensorShape _shape, ttype _type,
+                                 const void* buffer)
+    : RomTensor(_shape, _type, buffer) {
+  if (_shape.num_dims() != 1) {
+    uTensor_printf(
+        "[ERROR] Attempted to create scalar Tensor with more than one "
+        "dimension\n");
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError());
+  }
+  if (_shape[0] != 1) {
+    uTensor_printf("[ERROR] Scalar Tensor size not 1\n");
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError());
+  }
+}
+
+ScalarRomTensor::~ScalarRomTensor() {}
+void* ScalarRomTensor::read(uint32_t linear_index) const {
+  return RomTensor::read(0);
+}
+// HACK TODO, REMOVE THIS after getting Handles to work with const pointers
+void* ScalarRomTensor::write(uint32_t linear_index) {
+  return RomTensor::write(0);
+}
+
 // Returns floor of square root of x
 int floorPerfSqrt(int x) {
   // Base cases
@@ -66,7 +93,7 @@ DiagonalRomTensor::DiagonalRomTensor(TensorShape _shape, ttype _type,
                                      const void* buffer, size_t buffer_len)
     : RomTensor(_shape, _type, buffer) {
   if (_shape.num_dims() != 2) {
-    printf(
+    uTensor_printf(
         "[ERROR] Attempted to create diagonal Tensor with wrong number of "
         "dimensions\n");
     Context::get_default_context()->throwError(
@@ -74,7 +101,7 @@ DiagonalRomTensor::DiagonalRomTensor(TensorShape _shape, ttype _type,
  }
   uint16_t smaller_dim = (_shape[0] < _shape[1]) ? _shape[0] : _shape[1];
   if (buffer_len < smaller_dim) {
-    printf("[ERROR] Diagnoal Tensor size mismatch with buffer\n");
+    uTensor_printf("[ERROR] Diagonal Tensor size mismatch with buffer\n");
     Context::get_default_context()->throwError(
         new InvalidTensorDimensionsError());
   }
diff --git a/src/uTensor/tensors/RomTensor.hpp b/src/uTensor/tensors/RomTensor.hpp
index a8733309..30b87869 100644
--- a/src/uTensor/tensors/RomTensor.hpp
+++ b/src/uTensor/tensors/RomTensor.hpp
@@ -34,6 +34,16 @@ class RomTensor : public BufferTensor {
                                      uint32_t linear_index) override;
 };
 
+class ScalarRomTensor : public RomTensor {
+ protected:
+  virtual void* read(uint32_t linear_index) const override;
+  virtual void* write(uint32_t linear_index) override;
+
+ public:
+  ScalarRomTensor(TensorShape _shape, ttype _type, const void* buffer);
+  virtual ~ScalarRomTensor();
+};
+
 class DiagonalRomTensor : public RomTensor {
  protected:
   virtual void* read(uint32_t linear_index) const override;