diff --git a/.gitignore b/.gitignore
index 2b8c6977..9dda0010 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,7 @@ sd-driver
 #Project generated test files
 python/test_scripts/testData
 python/.venv
+__pycache__
 
 #Vim stuff
 *.swp
diff --git a/TESTS/tensors/test_romtensor.cpp b/TESTS/tensors/test_romtensor.cpp
index 025ca69a..be968065 100644
--- a/TESTS/tensors/test_romtensor.cpp
+++ b/TESTS/tensors/test_romtensor.cpp
@@ -125,3 +125,35 @@ TEST(Rom_Tensor, read_write_i16) {
   cout << "uint16 Sizeof IntegralValue " << sizeof(IntegralValue(5)) << endl;
   delete[] buffer;
 }
+
+TEST(ScalarRom_Tensor, read_write_i8) {
+  ///setup_context();
+  localCircularArenaAllocator<256> meta_allocator;
+  localCircularArenaAllocator<256> ram_allocator;
+  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
+  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
+  int8_t* buffer = new int8_t[1];
+  buffer[0] = 5;
+  ScalarRomTensor r({1}, i8, buffer);
+  int8_t read = r(2, 2);
+  EXPECT_EQ(read, 5);
+  cout << "i8 Sizeof IntegralValue " << sizeof(IntegralValue(5)) << endl;
+  cout << "Sizeof RomTensor " << sizeof(r) << endl;
+  delete[] buffer;
+}
+
+TEST(ScalarRom_Tensor, read_write_flt) {
+  ///setup_context();
+  localCircularArenaAllocator<256> meta_allocator;
+  localCircularArenaAllocator<256> ram_allocator;
+  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
+  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
+  float* buffer = new float[1];
+  buffer[0] = 5.0;
+  ScalarRomTensor r({1}, flt, buffer);
+  float read = r(2, 2);
+  EXPECT_NEAR(read, 5.0, 0.0001);
+  cout << "float Sizeof IntegralValue " << sizeof(IntegralValue(5)) << endl;
+  cout << "Sizeof RomTensor " << sizeof(r) << endl;
+  delete[] buffer;
+}
diff --git a/python/test_scripts/__init__.py b/python/test_scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/python/test_scripts/gen_softmax.py b/python/test_scripts/gen_softmax.py
new file mode 100644
index 00000000..a39a71a6
--- /dev/null
+++ b/python/test_scripts/gen_softmax.py
@@ -0,0 +1,24 @@
+from jinja_env import env2, Operator, Tensor, SingleOpTest
+import tensorflow as tf
+
+test_group = "Softmax"
+
+def gen_test(test_number, scale=1.0):
+    test_name = "random_gen_scale_%d__%d" % (int(scale), test_number)
+    in1 = tf.constant(tf.random.uniform([1, 10]) * scale).numpy()
+    out_1 = tf.nn.softmax(in1).numpy()
+
+    in_ref_name = "s_ref_in_%d" % test_number
+    out_ref_name = "s_ref_out_%d" % test_number
+    in_t = Tensor("in", in1, ref_name=in_ref_name)
+    out_ref = Tensor("out_ref", out_1, ref_name=out_ref_name)  # Store the reference out values
+    out_t = Tensor("out", out_1)
+    #conv_param_str = "{%s}, %s" % (str(strides).lstrip('[').rstrip(']'), padding)
+    #convOp = Operator("Conv2dOperator", "op_0", dtypes=["float"], param_str=conv_param_str)
+    op = Operator("SoftmaxOperator", "softmaxOp", dtypes=["float"])
+    op.set_inputs({"in": in_t}).set_outputs({"out": out_t})
+
+    test = SingleOpTest(test_group, test_name, op)
+    test.add_tensor_comparison(out_t, out_ref)
+    test_rendered, const_snippets = test.render()
+    print(test_rendered)
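Note that `gen_test` prints the rendered TEST block but currently drops `const_snippets` on the floor, so the reference arrays still have to be written into a constants header separately. A minimal sketch of driving the generator (the `__main__` wrapper and iteration count below are illustrative assumptions, not part of this patch):

```python
# Hypothetical driver for gen_softmax.py; run from python/test_scripts/.
# Redirect stdout into a generated test .cpp file.
from gen_softmax import gen_test

if __name__ == "__main__":
    for test_number in range(5):           # number of random cases: arbitrary
        gen_test(test_number, scale=10.0)  # scale spreads the uniform inputs
```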
Path(__file__).parent / "templates_v2" env = jinja2.Environment( loader=jinja2.FileSystemLoader(_template_dir), trim_blocks=True, lstrip_blocks=True -) + ) env.globals.update( zip=zip, len=len, TENSOR_TYPE_MAP={ - "int8_t": "i8", - "uint8_t": "u8", - "int16_t": "i16", - "uint16_t": "u16", - "int32_t": "i32", - "uint32_t": "u32", - "float": "flt", - }, -) + "int8_t": "i8", + "uint8_t": "u8", + "int16_t": "i16", + "uint16_t": "u16", + "int32_t": "i32", + "uint32_t": "u32", + "float": "flt", + }, + ) del _template_dir + +env2 = jinja2.Environment( + loader=jinja2.FileSystemLoader(_template2_dir), trim_blocks=True, lstrip_blocks=True + ) +env2.globals.update( + zip=zip, + len=len, + TENSOR_TYPE_MAP={ + "int8_t": "i8", + "uint8_t": "u8", + "int16_t": "i16", + "uint16_t": "u16", + "int32_t": "i32", + "uint32_t": "u32", + "float": "flt", + }, + NUMPY_2_CMAP={ + np.int8: "int8_t", + np.uint8: "uint8_t", + np.int16: "int16_t", + np.uint16: "uint16_t", + np.int32: "int32_t", + np.uint32: "uint32_t", + np.float: "float", + np.dtype('float32'): "float", + }, + ) + +TENSOR_TYPE_MAP={ + "int8_t": "i8", + "uint8_t": "u8", + "int16_t": "i16", + "uint16_t": "u16", + "int32_t": "i32", + "uint32_t": "u32", + "float": "flt", + } +NUMPY_2_CMAP={ + np.int8: "int8_t", + np.uint8: "uint8_t", + np.int16: "int16_t", + np.uint16: "uint16_t", + np.int32: "int32_t", + np.uint32: "uint32_t", + np.float: "float", + np.dtype('float32'): "float", + } + +class Tensor: + def __init__(self, name, np_array, ref_name=None, quantize_params=[]): + self.name = name + self.np_array = np_array + self.ref_name = ref_name + self.quantize_params = quantize_params + + @property + def shape(self): + return self.np_array.shape + + @property + def dtype(self): + return NUMPY_2_CMAP[self.np_array.dtype] + + @property + def utype(self): + return TENSOR_TYPE_MAP[self.dtype] + + def flatten(self): + return self.np_array.flatten() + + def render_constant(self): + if self.ref_name: + return env2.get_template('def_constant.hpp').render(tensor=self) + else: + return "" + def render_declaration(self): + if self.ref_name: + return env2.get_template('declare_rom_tensor.cpp').render(tensor=self) + else: + return env2.get_template('declare_ram_tensor.cpp').render(tensor=self) + + +class Operator: + def __init__(self, op_type, name, dtypes=[], param_str=None): + self.op_type = op_type + self.name = name + self.dtypes = dtypes + self.param_str = param_str + self.array_template = env2.get_template('array_template.cpp') + self.input_map = {} + self.output_map = {} + self.type_signature = env2.get_template('op_type_signature.cpp').render(op=self) + + def set_inputs(self, input_map): + self.input_map = input_map + return self + + def set_outputs(self, output_map): + self.output_map = output_map + return self + + def render_declaration(self): + return env2.get_template('declare_operator.cpp').render(op=self) + + def render_eval(self): + return env2.get_template('eval_operator.cpp').render(op=self) + +class SingleOpTest: + def __init__(self, test_group, test_name, target_op): + self.test_group = test_group + self.test_name = test_name + self.out_size = 0 + for out_tensor in target_op.output_map: + self.out_size += len(target_op.output_map[out_tensor].flatten()) + self.target_op = target_op + self.compare_tensors = [] + self.tensor_set = set() + for tensor in target_op.input_map: + self.tensor_set.add(target_op.input_map[tensor]) + for tensor in target_op.output_map: + self.tensor_set.add(target_op.output_map[tensor]) + + def add_tensor_comparison(self, a, b): 
+
+class SingleOpTest:
+    def __init__(self, test_group, test_name, target_op):
+        self.test_group = test_group
+        self.test_name = test_name
+        self.out_size = 0
+        for out_tensor in target_op.output_map:
+            self.out_size += len(target_op.output_map[out_tensor].flatten())
+        self.target_op = target_op
+        self.compare_tensors = []
+        self.tensor_set = set()
+        for tensor in target_op.input_map:
+            self.tensor_set.add(target_op.input_map[tensor])
+        for tensor in target_op.output_map:
+            self.tensor_set.add(target_op.output_map[tensor])
+
+    def add_tensor_comparison(self, a, b):
+        self.compare_tensors.append((a, b))
+        self.tensor_set.add(a)
+        self.tensor_set.add(b)
+
+    def render(self):
+        const_snippets = []
+        tensor_decls = []
+        for tensor in self.tensor_set:
+            const_snippets.append(tensor.render_constant())
+            tensor_decls.append(tensor.render_declaration())
+        op_decl = self.target_op.render_declaration()
+        op_eval = self.target_op.render_eval()
+
+        compare_snippets = []
+        for a, b in self.compare_tensors:
+            compare_snippets.append(env2.get_template('compare_outputs.cpp').render(a=a, b=b))
+
+        TestTemplate = env2.get_template('test_container.cpp')
+        test_rendered = TestTemplate.render(test_group=self.test_group,
+                                            test_name=self.test_name,
+                                            out_size=self.out_size,
+                                            tensor_declarations=tensor_decls,
+                                            op_decl=op_decl,
+                                            op_eval=op_eval,
+                                            compare_snippets=compare_snippets)
+        return (test_rendered, const_snippets)
+
+
+del _template2_dir
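As a quick illustration of the type plumbing above, `Tensor.dtype` resolves the NumPy dtype to a C type via `NUMPY_2_CMAP`, and `Tensor.utype` maps that to the uTensor type enum via `TENSOR_TYPE_MAP`; a small sketch of the intended behavior:

```python
# Minimal sketch of the Tensor helper's type mapping.
import numpy as np
from jinja_env import Tensor

t = Tensor("in", np.zeros((1, 10), dtype=np.float32), ref_name="s_ref_in_0")
assert t.dtype == "float"   # NUMPY_2_CMAP[np.dtype('float32')]
assert t.utype == "flt"     # TENSOR_TYPE_MAP["float"]
assert t.shape == (1, 10)   # forwarded from the underlying ndarray
```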
diff --git a/python/test_scripts/jinja_env/templates_v2/array_template.cpp b/python/test_scripts/jinja_env/templates_v2/array_template.cpp
new file mode 100644
index 00000000..cbaec841
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/array_template.cpp
@@ -0,0 +1 @@
+{% for x in arr %}{{ x }}{{ "," if not loop.last }}{% endfor %}
diff --git a/python/test_scripts/jinja_env/templates_v2/compare_outputs.cpp b/python/test_scripts/jinja_env/templates_v2/compare_outputs.cpp
new file mode 100644
index 00000000..04f8c007
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/compare_outputs.cpp
@@ -0,0 +1,3 @@
+for(int i = 0; i < {{ len(b.flatten()) }}; i++) {
+  EXPECT_NEAR(static_cast<{{ a.dtype }}>( {{ a.name }}(i) ), static_cast<{{ b.dtype }}>( {{ b.name }}(i) ), 0.0001);
+}
diff --git a/python/test_scripts/jinja_env/templates_v2/const_container.hpp b/python/test_scripts/jinja_env/templates_v2/const_container.hpp
new file mode 100644
index 00000000..f41695a2
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/const_container.hpp
@@ -0,0 +1,6 @@
+#ifndef {{ constants_header | replace(".", "_") }}
+#define {{ constants_header | replace(".", "_") }}
+{% for constant_snippet in constants %}
+{{ constant_snippet }}
+{% endfor %}
+#endif
diff --git a/python/test_scripts/jinja_env/templates_v2/declare_operator.cpp b/python/test_scripts/jinja_env/templates_v2/declare_operator.cpp
new file mode 100644
index 00000000..035f26d0
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/declare_operator.cpp
@@ -0,0 +1 @@
+{{ op.type_signature }} {{ op.name }}{% if op.param_str %}({{ op.param_str }}){% endif %};
diff --git a/python/test_scripts/jinja_env/templates_v2/declare_ram_tensor.cpp b/python/test_scripts/jinja_env/templates_v2/declare_ram_tensor.cpp
new file mode 100644
index 00000000..6b674d57
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/declare_ram_tensor.cpp
@@ -0,0 +1,4 @@
+Tensor {{ tensor.name }} = new RamTensor({ {% for s in tensor.shape %}{{ s }}{{ "," if not loop.last }}{% endfor %} }, {{ tensor.utype }});
+{% if tensor.quantize_params %}
+  {{ tensor.name }}->set_quantization_params(PerTensorQuantizationParams({{ tensor.quantize_params[1] }}, {{ tensor.quantize_params[0] }}));
+{% endif %}
diff --git a/python/test_scripts/jinja_env/templates_v2/declare_rom_tensor.cpp b/python/test_scripts/jinja_env/templates_v2/declare_rom_tensor.cpp
new file mode 100644
index 00000000..447775c1
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/declare_rom_tensor.cpp
@@ -0,0 +1,4 @@
+Tensor {{ tensor.name }} = new RomTensor({ {% for s in tensor.shape %}{{ s }}{{ "," if not loop.last }}{% endfor %} }, {{ tensor.utype }}, {{ tensor.ref_name }});
+{% if tensor.quantize_params %}
+  {{ tensor.name }}->set_quantization_params(PerTensorQuantizationParams({{ tensor.quantize_params[1] }}, {{ tensor.quantize_params[0] }}));
+{% endif %}
diff --git a/python/test_scripts/jinja_env/templates_v2/def_constant.hpp b/python/test_scripts/jinja_env/templates_v2/def_constant.hpp
new file mode 100644
index 00000000..f213e528
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/def_constant.hpp
@@ -0,0 +1,3 @@
+static const {{ tensor.dtype }} {{ tensor.ref_name }}[{{ len(tensor.flatten()) }}] = {
+  {% for x in tensor.flatten() %} {{ x }}{{ "," if not loop.last }}{{ "\n" if not loop.first and loop.index % 10 == 0 }} {% endfor %}
+};
diff --git a/python/test_scripts/jinja_env/templates_v2/eval_operator.cpp b/python/test_scripts/jinja_env/templates_v2/eval_operator.cpp
new file mode 100644
index 00000000..0ec0c0d4
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/eval_operator.cpp
@@ -0,0 +1,10 @@
+{{ op.name }}
+  .set_inputs({
+    {% for x in op.input_map %}
+    { {{ op.type_signature }}::{{ x }}, {{ op.input_map[x].name }} }{{ "," if not loop.last }}
+    {% endfor %}
+  }).set_outputs({
+    {% for x in op.output_map %}
+    { {{ op.type_signature }}::{{ x }}, {{ op.output_map[x].name }} }{{ "," if not loop.last }}
+    {% endfor %}
+  }).eval();
diff --git a/python/test_scripts/jinja_env/templates_v2/op_type_signature.cpp b/python/test_scripts/jinja_env/templates_v2/op_type_signature.cpp
new file mode 100644
index 00000000..6b658313
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/op_type_signature.cpp
@@ -0,0 +1 @@
+{{ op.op_type }}{% if op.dtypes %}<{{ op.array_template.render(arr=op.dtypes) }}>{% endif %}
diff --git a/python/test_scripts/jinja_env/templates_v2/test_container.cpp b/python/test_scripts/jinja_env/templates_v2/test_container.cpp
new file mode 100644
index 00000000..2c2716f6
--- /dev/null
+++ b/python/test_scripts/jinja_env/templates_v2/test_container.cpp
@@ -0,0 +1,20 @@
+/***************************************
+ * Generated Test
+ ***************************************/
+
+TEST({{ test_group }}, {{ test_name }}) {
+  localCircularArenaAllocator<1024> meta_allocator;
+  localCircularArenaAllocator<{{ out_size }}*2*sizeof(float), uint32_t> ram_allocator;
+  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
+  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);
+
+  {% for tensor_decl in tensor_declarations %}
+  {{ tensor_decl }}{% endfor %}
+
+  {{ op_decl }}
+  {{ op_eval }}
+
+  {% for compare_snippet in compare_snippets %}
+  {{ compare_snippet }}
+  {% endfor %}
+}
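Composed, these templates emit self-contained googletest cases. A hand-rendered approximation of one Softmax case from `gen_softmax.py` follows; treat it as a sketch, since tensor declaration order depends on `tensor_set` iteration and exact whitespace on jinja's trim settings:

```cpp
/***************************************
 * Generated Test
 ***************************************/

TEST(Softmax, random_gen_scale_10__0) {
  localCircularArenaAllocator<1024> meta_allocator;
  localCircularArenaAllocator<10*2*sizeof(float), uint32_t> ram_allocator;
  Context::get_default_context()->set_metadata_allocator(&meta_allocator);
  Context::get_default_context()->set_ram_data_allocator(&ram_allocator);

  // s_ref_in_0 / s_ref_out_0 come from the def_constant.hpp snippets,
  // collected into a constants header via const_container.hpp.
  Tensor in = new RomTensor({ 1,10 }, flt, s_ref_in_0);
  Tensor out_ref = new RomTensor({ 1,10 }, flt, s_ref_out_0);
  Tensor out = new RamTensor({ 1,10 }, flt);

  SoftmaxOperator<float> softmaxOp;
  softmaxOp
    .set_inputs({
      { SoftmaxOperator<float>::in, in }
    }).set_outputs({
      { SoftmaxOperator<float>::out, out }
    }).eval();

  for(int i = 0; i < 10; i++) {
    EXPECT_NEAR(static_cast<float>( out(i) ), static_cast<float>( out_ref(i) ), 0.0001);
  }
}
```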
diff --git a/src/uTensor/CMakeLists.txt b/src/uTensor/CMakeLists.txt
index 527c9b4b..8e38e115 100644
--- a/src/uTensor/CMakeLists.txt
+++ b/src/uTensor/CMakeLists.txt
@@ -14,6 +14,8 @@ set(src_utensor_tensors
 )
 set(src_utensor_ops
   ops/Matrix.cpp
+  ops/Matrix_kernels.cpp
+  ops/Convolution_kernels.cpp
 )
 set(src_utensor_errhndl
   errorHandlers/SimpleErrorHandler.cpp
diff --git a/src/uTensor/core/quantizationPrimitives.cpp b/src/uTensor/core/quantizationPrimitives.cpp
index e3c21631..aa70d912 100644
--- a/src/uTensor/core/quantizationPrimitives.cpp
+++ b/src/uTensor/core/quantizationPrimitives.cpp
@@ -83,11 +83,15 @@ void QuantizationParamsHandle::free() {
 QuantizationParamsHandle::QuantizationParamsHandle() : Handle() {}
 QuantizationParamsHandle::QuantizationParamsHandle(QuantizationParams* ptr)
     : Handle((void*)ptr) {
   // Context::get_default_context()->get_metadata_allocator()->bind(_ptr, this);
-  bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  if (ptr) {
+    bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  }
 }
 QuantizationParamsHandle& QuantizationParamsHandle::operator=(
     QuantizationParams* ptr) {
   _ptr = (void*)ptr;
-  bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  if (ptr) {
+    bind(*this, *Context::get_default_context()->get_metadata_allocator());
+  }
   // Context::get_metadata_allocator()->bind(_ptr, this);
   return *this;
 }
diff --git a/src/uTensor/ops/ActivationFncs.hpp b/src/uTensor/ops/ActivationFncs.hpp
index 9b2dc390..5ca82698 100644
--- a/src/uTensor/ops/ActivationFncs.hpp
+++ b/src/uTensor/ops/ActivationFncs.hpp
@@ -83,7 +83,87 @@ class ReLU6Operator : public OperatorInterface<1, 1> {
   }
 };
 
-}
+template <typename T>
+class InPlaceSoftmax : public InPlaceActivationFnc {
+  // Softmax only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct Softmax on non-signed types");
+
+ public:
+  InPlaceSoftmax() : beta(1) {}
+  InPlaceSoftmax(T beta) : beta(beta) {}
+
+ protected:
+  virtual void compute() { inplace_softmax_k<T>(inputs[x].tensor(), beta); }
+
+ private:
+  T beta;
+};
+
+template <typename T>
+class SoftmaxOperator : public OperatorInterface<1, 1> {
+  // Softmax only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct softmax on non-signed types");
+
+ public:
+  enum names_in : uint8_t { in };
+  enum names_out : uint8_t { out };
+
+ public:
+  SoftmaxOperator() : beta(1) {}
+  SoftmaxOperator(T beta) : beta(beta) {}
+
+ protected:
+  virtual void compute() {
+    const Tensor& inT = inputs[in].tensor();
+    Tensor& outT = outputs[out].tensor();
+    // TODO Check sizes here and throw mismatch
+    uint32_t in_size = inT->get_shape().get_linear_size();
+    uint32_t out_size = outT->get_shape().get_linear_size();
+    if (in_size != out_size)
+      Context::get_default_context()->throwError(
+          new OperatorIOSizeMismatchError);
+    softmax_k<T>(outT, inT, beta);
+  }
+
+ private:
+  T beta;
+};
+
+template <typename T>
+class InPlaceSigmoid : public InPlaceActivationFnc {
+  // Sigmoid only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct Sigmoid on non-signed types");
+
+ protected:
+  virtual void compute() { inplace_sigmoid_k<T>(inputs[x].tensor()); }
+};
+
+template <typename T>
+class SigmoidOperator : public OperatorInterface<1, 1> {
+  // Sigmoid only makes sense if there is a notion of negative
+  static_assert(std::is_signed<T>::value,
+                "Error attempted to construct Sigmoid on non-signed types");
+
+ public:
+  enum names_in : uint8_t { in };
+  enum names_out : uint8_t { out };
+
+ protected:
+  virtual void compute() {
+    const Tensor& inT = inputs[in].tensor();
+    Tensor& outT = outputs[out].tensor();
+    // TODO Check sizes here and throw mismatch
+    uint32_t in_size = inT->get_shape().get_linear_size();
+    uint32_t out_size = outT->get_shape().get_linear_size();
+    if (in_size != out_size)
+      Context::get_default_context()->throwError(
+          new OperatorIOSizeMismatchError);
+    sigmoid_k<T>(outT, inT);
+  }
+};
+
+}  // namespace ReferenceOperators
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/ActivationFncs_kernels.hpp b/src/uTensor/ops/ActivationFncs_kernels.hpp
index 9300c134..62777ede 100644
--- a/src/uTensor/ops/ActivationFncs_kernels.hpp
+++ b/src/uTensor/ops/ActivationFncs_kernels.hpp
@@ -1,9 +1,43 @@
 #ifndef UTENSOR_ACTIVATIONS_KERNELS_H
 #define UTENSOR_ACTIVATIONS_KERNELS_H
 #include "operatorBase.hpp"
+#include <cmath>
+#include <functional>
+#include <limits>
+
+using std::exp;
 
 namespace uTensor {
+namespace Fuseable {
+
+template <typename T>
+using Activation = std::function<T(T)>;
+
+template <typename T>
+T NoActivation(T x) {
+  return x;
+}
+
+template <typename T>
+T ReLU(T x) {
+  return (x < 0) ? 0 : x;
+}
+
+template <typename T>
+T ReLU6(T x) {
+  if (x < 0) {
+    return 0;
+  } else if (x > 6) {
+    return 6;
+  } else {
+    return x;
+  }
+}
+
+template <typename T>
+T Sigmoid(T x) {
+  const T one = 1;
+  return one / (one + exp(-x));
+}
+
+}  // namespace Fuseable
+
 template <typename T>
 void inplace_relu_k(Tensor& t) {
   T tmp;
@@ -60,5 +94,86 @@ void relu6_k(Tensor& out, const Tensor& in) {
   }
 }
 
+template <typename T>
+void inplace_softmax_k(Tensor& in, T beta = 1) {
+  const TensorShape& inShape = in->get_shape();
+  int outer_dim = inShape.num_dims() - 1;
+  int depth = inShape[outer_dim];
+  int out_side_numelems = 1;
+  for (int i = 0; i < inShape.num_dims(); i++) {
+    out_side_numelems *= (i == outer_dim) ? 1 : inShape[i];
+  }
+
+  for (int i = 0; i < out_side_numelems; i++) {
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    T max = std::numeric_limits<T>::lowest();
+    for (int j = 0; j < depth; j++) {
+      max = std::max(max, static_cast<T>(in(i, j)));
+    }
+
+    T mSum = 0;
+    for (int j = 0; j < depth; j++) {
+      T tmp = exp((static_cast<T>(in(i, j)) - max) * beta);
+      mSum += tmp;
+      in(i, j) = tmp;
+    }
+    for (int j = 0; j < depth; j++) {
+      in(i, j) = static_cast<T>(in(i, j)) / mSum;
+    }
+  }
+}
+
+template <typename T>
+void softmax_k(Tensor& out, const Tensor& in, T beta = 1) {
+  const TensorShape& inShape = in->get_shape();
+  int outer_dim = inShape.num_dims() - 1;
+  int depth = inShape[outer_dim];
+  int out_side_numelems = 1;
+  for (int i = 0; i < inShape.num_dims(); i++) {
+    out_side_numelems *= (i == outer_dim) ? 1 : inShape[i];
+  }
+
+  for (int i = 0; i < out_side_numelems; i++) {
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    T max = std::numeric_limits<T>::lowest();
+    for (int j = 0; j < depth; j++) {
+      max = std::max(max, static_cast<T>(in(i, j)));
+    }
+
+    T mSum = 0;
+    for (int j = 0; j < depth; j++) {
+      T tmp = exp((static_cast<T>(in(i, j)) - max) * beta);
+      mSum += tmp;
+      out(i, j) = tmp;
+    }
+    for (int j = 0; j < depth; j++) {
+      out(i, j) = static_cast<T>(out(i, j)) / mSum;
+    }
+  }
+}
+
+template <typename T>
+void inplace_sigmoid_k(Tensor& t) {
+  const T one = 1;
+  uint32_t t_size = t->get_shape().get_linear_size();
+  for (uint32_t i = 0; i < t_size; i++) {
+    const T tmp = one / (one + exp(-static_cast<T>(t(i))));
+    t(i) = tmp;
+  }
+}
+
+template <typename T>
+void sigmoid_k(Tensor& out, const Tensor& in) {
+  const T one = 1;
+  uint32_t t_size = in->get_shape().get_linear_size();
+  for (uint32_t i = 0; i < t_size; i++) {
+    const T tmp = one / (one + exp(-static_cast<T>(in(i))));
+    out(i) = tmp;
+  }
+}
+
 }  // namespace uTensor
 #endif
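Both softmax kernels subtract the per-row maximum before exponentiating. This is the standard overflow guard and relies on the shift invariance the inline comment alludes to, with the `beta` temperature factored in:

$$\mathrm{softmax}_\beta(x)_i = \frac{e^{\beta(x_i - m)}}{\sum_j e^{\beta(x_j - m)}} = \frac{e^{\beta x_i}}{\sum_j e^{\beta x_j}}, \qquad m = \max_j x_j,$$

so for positive `beta` the largest argument ever passed to `exp` is 0, and large inputs cannot overflow.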
diff --git a/src/uTensor/ops/Arithmetic.hpp b/src/uTensor/ops/Arithmetic.hpp
index 2be81534..2bfe1e38 100644
--- a/src/uTensor/ops/Arithmetic.hpp
+++ b/src/uTensor/ops/Arithmetic.hpp
@@ -23,6 +23,34 @@ class AddOperator : public OperatorInterface<2, 1> {
   }
 };
 
+template <typename T>
+class SubOperator : public OperatorInterface<2, 1> {
+ public:
+  enum names_in : uint8_t { a, b };
+  enum names_out : uint8_t { c };
+  // SubOperator(FixedTensorMap<2> inputs, FixedTensorMap<1> outputs) :
+  //     OperatorBase(inputs, outputs) {}
+
+ protected:
+  virtual void compute() {
+    sub_kernel<T>(outputs[c].tensor(), inputs[a].tensor(), inputs[b].tensor());
+  }
+};
+
+template <typename T>
+class MulOperator : public OperatorInterface<2, 1> {
+ public:
+  enum names_in : uint8_t { a, b };
+  enum names_out : uint8_t { c };
+  // MulOperator(FixedTensorMap<2> inputs, FixedTensorMap<1> outputs) :
+  //     OperatorBase(inputs, outputs) {}
+
+ protected:
+  virtual void compute() {
+    mul_kernel<T>(outputs[c].tensor(), inputs[a].tensor(), inputs[b].tensor());
+  }
+};
+
 }
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/Arithmetic_kernels.hpp b/src/uTensor/ops/Arithmetic_kernels.hpp
index 4937f45e..1250d2ab 100644
--- a/src/uTensor/ops/Arithmetic_kernels.hpp
+++ b/src/uTensor/ops/Arithmetic_kernels.hpp
@@ -6,7 +6,7 @@ namespace uTensor {
 template <typename T>
 void add_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
   // Decide on c shape
-  TensorShape c_shape = c->get_shape();
+  const TensorShape& c_shape = c->get_shape();
   uint32_t c_size = c_shape.get_linear_size();
   // TensorInterface& C = reinterpret_cast<TensorInterface&>(*c);
   // const TensorInterface& A = reinterpret_cast<const TensorInterface&>(*a);
@@ -16,5 +16,31 @@ void add_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
     c(i) = static_cast<T>(static_cast<T>(a(i)) + static_cast<T>(b(i)));
 }
 
+template <typename T>
+void sub_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
+  // Decide on c shape
+  const TensorShape& c_shape = c->get_shape();
+  uint32_t c_size = c_shape.get_linear_size();
+
+  for (uint32_t i = 0; i < c_size; i++)
+    c(i) = static_cast<T>(static_cast<T>(a(i)) - static_cast<T>(b(i)));
+}
+
+template <typename T>
+void mul_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
+  // Decide on c shape
+  const TensorShape& c_shape = c->get_shape();
+  uint32_t c_size = c_shape.get_linear_size();
+
+  for (uint32_t i = 0; i < c_size; i++)
+    c(i) = static_cast<T>(static_cast<T>(a(i)) * static_cast<T>(b(i)));
+}
+
 }  // namespace uTensor
 #endif
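Worth noting: `add_kernel`, `sub_kernel`, and `mul_kernel` all walk the output's linear size and index `a` and `b` with the same flat index, so all three tensors must agree in shape; there is no broadcasting. A small usage sketch (values illustrative; assumes the usual test scaffolding and a using-declaration for `uTensor::ReferenceOperators`):

```cpp
// Elementwise multiply sketch; all three shapes must match exactly.
static const float a_data[3] = {1.0f, 2.0f, 3.0f};
static const float b_data[3] = {4.0f, 5.0f, 6.0f};

Tensor a = new RomTensor({3}, flt, a_data);
Tensor b = new RomTensor({3}, flt, b_data);
Tensor c = new RamTensor({3}, flt);

MulOperator<float> mulOp;
mulOp
    .set_inputs({{MulOperator<float>::a, a}, {MulOperator<float>::b, b}})
    .set_outputs({{MulOperator<float>::c, c}})
    .eval();  // c now holds {4, 10, 18}
```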
diff --git a/src/uTensor/ops/Convolution.hpp b/src/uTensor/ops/Convolution.hpp
index faf9e5ba..462908ca 100644
--- a/src/uTensor/ops/Convolution.hpp
+++ b/src/uTensor/ops/Convolution.hpp
@@ -225,6 +225,80 @@ using MaxPoolOperator = GenericPoolOperator<T, MaxFilter<T>>;
 template <typename T>
 using AvgPoolOperator = GenericPoolOperator<T, AvgFilter<T>>;
 
+template <typename Tout>
+class DepthwiseSeparableConvOperatorV2 : public OperatorInterface<3, 1> {
+ public:
+  enum names_in : uint8_t { in, filter, bias };
+  enum names_out : uint8_t { out };
+
+ public:
+  DepthwiseSeparableConvOperatorV2();
+  // TODO allow 4D bits later
+  // DepthwiseSeparableConvOperatorV2(
+  //     const uint16_t (&strides)[4], Padding padding,
+  //     const int depth_multiplier = 1, const uint16_t (&dilation)[2] = {1, 1});
+  DepthwiseSeparableConvOperatorV2(
+      const uint16_t (&strides)[2], Padding padding,
+      const int depth_multiplier = 1, const uint16_t (&dilation)[2] = {1, 1});
+
+ protected:
+  virtual void compute();
+
+ private:
+  // TfLiteDepthwiseConvParams
+  // Set by constructors
+  uint16_t _stride[4];
+  Padding _padding;
+  int depth_multiplier;
+  uint16_t _dilation[2];
+};
+
+template <typename Tout>
+DepthwiseSeparableConvOperatorV2<Tout>::DepthwiseSeparableConvOperatorV2()
+    : _stride{1, 1, 1, 1},
+      _padding(SAME),
+      depth_multiplier(1),
+      _dilation{1, 1} {}
+
+template <typename Tout>
+DepthwiseSeparableConvOperatorV2<Tout>::DepthwiseSeparableConvOperatorV2(
+    const uint16_t (&strides)[2], Padding padding, const int depth_multiplier,
+    const uint16_t (&dilation)[2])
+    : _stride{1, strides[0], strides[1], 1},
+      _padding(padding),
+      depth_multiplier(depth_multiplier),
+      _dilation{dilation[0], dilation[1]} {}
+
+template <typename Tout>
+void DepthwiseSeparableConvOperatorV2<Tout>::compute() {
+  const TensorShape& in_shape = inputs[in].tensor()->get_shape();
+  const TensorShape& df_shape = inputs[filter].tensor()->get_shape();
+  const TensorShape& bias_shape = inputs[bias].tensor()->get_shape();
+  const TensorShape& out_shape = outputs[out].tensor()->get_shape();
+
+  if (in_shape[3] != df_shape[2]) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (bias_shape[0] != 1 || bias_shape[1] != 1) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+
+  depthwise_separable_convolution_kernel_v2<Tout>(
+      outputs[out].tensor(), inputs[in].tensor(), inputs[filter].tensor(),
+      inputs[bias].tensor(), _padding, _stride, depth_multiplier, _dilation);
+}
+
 }
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/Convolution_kernels.cpp b/src/uTensor/ops/Convolution_kernels.cpp
new file mode 100644
index 00000000..557e7f81
--- /dev/null
+++ b/src/uTensor/ops/Convolution_kernels.cpp
@@ -0,0 +1,52 @@
+#include "Convolution_kernels.hpp"
+
+namespace uTensor {
+// It's not guaranteed that padding is symmetric. It's important to keep
+// the offset for algorithms that need all paddings.
+int ComputePaddingWithOffset(int stride, int dilation_rate, int in_size,
+                             int filter_size, int out_size, int* offset) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  int total_padding =
+      ((out_size - 1) * stride + effective_filter_size - in_size);
+  total_padding = total_padding > 0 ? total_padding : 0;
+  *offset = total_padding % 2;
+  return total_padding / 2;
+}
+
+// Matching GetWindowedOutputSize in TensorFlow.
+int ComputeOutSize(Padding padding, int image_size, int filter_size,
+                   int stride, int dilation_rate = 1) {
+  int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+  switch (padding) {
+    case SAME:
+      return (image_size + stride - 1) / stride;
+    case VALID:
+      return (image_size + stride - effective_filter_size) / stride;
+    default:
+      return 0;
+  }
+}
+
+void uComputePaddingHeightWidth(int stride_height, int stride_width,
+                                int dilation_rate_height,
+                                int dilation_rate_width, int in_height,
+                                int in_width, int filter_height,
+                                int filter_width, int* padding_height,
+                                int* padding_width, Padding padding,
+                                int* out_height, int* out_width) {
+  *out_width = ComputeOutSize(padding, in_width, filter_width, stride_width,
+                              dilation_rate_width);
+  *out_height = ComputeOutSize(padding, in_height, filter_height,
+                               stride_height, dilation_rate_height);
+
+  int offset = 0;
+  *padding_height =
+      ComputePaddingWithOffset(stride_height, dilation_rate_height, in_height,
+                               filter_height, *out_height, &offset);
+  *padding_width =
+      ComputePaddingWithOffset(stride_width, dilation_rate_width, in_width,
+                               filter_width, *out_width, &offset);
+}
+
+}  // namespace uTensor
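As a worked example of the padding math above, take `SAME` padding with `image_size = 10`, `filter_size = 3`, `stride = 2`, `dilation_rate = 1`: `ComputeOutSize` gives `(10 + 2 - 1) / 2 = 5`; `ComputePaddingWithOffset` then computes `total_padding = (5 - 1) * 2 + 3 - 10 = 1`, returns `1 / 2 = 0`, and reports the odd leftover pixel through `*offset = 1`, which is exactly the asymmetric-padding case the comment warns about.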
diff --git a/src/uTensor/ops/Convolution_kernels.hpp b/src/uTensor/ops/Convolution_kernels.hpp
index 7000df33..03b2b997 100644
--- a/src/uTensor/ops/Convolution_kernels.hpp
+++ b/src/uTensor/ops/Convolution_kernels.hpp
@@ -426,5 +426,118 @@ void depthwise_separable_convolution_kernel(Tensor& out, const Tensor& in,
   }
 }
 
+void uComputePaddingHeightWidth(int stride_height, int stride_width,
+                                int dilation_rate_height,
+                                int dilation_rate_width, int in_height,
+                                int in_width, int filter_height,
+                                int filter_width, int* padding_height,
+                                int* padding_width, Padding padding,
+                                int* out_height, int* out_width);
+
+template <typename T>
+void depthwise_separable_convolution_kernel_v2(
+    Tensor& output, const Tensor& input, const Tensor& filter,
+    const Tensor& bias, const Padding padding, const uint16_t (&strides)[4],
+    const int depth_multiplier, const uint16_t (&dilation)[2]) {
+  // Check dimensions of the tensors.
+  const TensorShape& input_shape = input->get_shape();
+  const TensorShape& filter_shape = filter->get_shape();
+  const TensorShape& output_shape = output->get_shape();
+
+  const int batches = input_shape[0];
+  const int output_depth = output_shape[3];  // Same as filter_shape[3]
+  const int output_height = output_shape[1];
+  const int output_width = output_shape[2];
+  const int input_width = input_shape[2];
+  const int input_height = input_shape[1];
+  const int input_depth = input_shape[3];
+  const int filter_width = filter_shape[2];
+  const int filter_height = filter_shape[1];
+  const int stride_width = strides[2];
+  const int stride_height = strides[1];
+  const int dilation_width_factor = dilation[1];
+  const int dilation_height_factor = dilation[0];
+
+  int unused_output_height, unused_output_width;
+  int pad_width, pad_height;
+
+  uComputePaddingHeightWidth(stride_height, stride_width, 1, 1, input_height,
+                             input_width, filter_height, filter_width,
+                             &pad_height, &pad_width, padding,
+                             &unused_output_height, &unused_output_width);
+
+  if (!(input_shape.num_dims() == 4)) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(filter_shape.num_dims() == 4)) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(output_shape.num_dims() == 4)) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(output_depth == filter_shape[3])) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+  if (!(batches == output_shape[0])) {
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError);
+  }
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            T acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x =
+                    in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  T input_val =
+                      static_cast<T>(input(batch, in_y, in_x, in_channel));
+                  T filter_val = static_cast<T>(
+                      filter(0, filter_y, filter_x, output_channel));
+                  acc += filter_val * input_val;
+                }
+              }
+            }
+            // assuming bias data will always be provided
+            acc += static_cast<T>(bias(output_channel));
+
+            output(batch, out_y, out_x, output_channel) = static_cast<T>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace uTensor
 #endif
diff --git a/src/uTensor/ops/Matrix.cpp b/src/uTensor/ops/Matrix.cpp
index 21188df4..f0c6c714 100644
--- a/src/uTensor/ops/Matrix.cpp
+++ b/src/uTensor/ops/Matrix.cpp
@@ -2,5 +2,4 @@
 
 namespace uTensor {
 
-DEFINE_ERROR(InvalidMatrixMultIndicesError);
 }
uTensor_printf("[Error] Invalid matrix multiple shape mismatch\n"); + Context::get_default_context()->throwError( + new InvalidMatrixMultIndicesError); + } + if (have_bias) { + matrix_mult_kernel_v2( + outputs[output].tensor(), inputs[input].tensor(), + inputs[filter].tensor(), inputs[bias].tensor(), _activation); + } else { + matrix_mult_kernel_v2( + outputs[output].tensor(), inputs[input].tensor(), + inputs[filter].tensor(), _activation); + } + } +}; + +template +using FullyConnectedOperator = MatrixMultOperatorV2; + } } // namespace uTensor #endif diff --git a/src/uTensor/ops/Matrix_kernels.cpp b/src/uTensor/ops/Matrix_kernels.cpp new file mode 100644 index 00000000..a8bdc803 --- /dev/null +++ b/src/uTensor/ops/Matrix_kernels.cpp @@ -0,0 +1,6 @@ +#include "Matrix_kernels.hpp" + +namespace uTensor { + +DEFINE_ERROR(InvalidMatrixMultIndicesError); +} diff --git a/src/uTensor/ops/Matrix_kernels.hpp b/src/uTensor/ops/Matrix_kernels.hpp new file mode 100644 index 00000000..96ff2f4f --- /dev/null +++ b/src/uTensor/ops/Matrix_kernels.hpp @@ -0,0 +1,104 @@ +#ifndef UTENSOR_MATRIX_KERNELS +#define UTENSOR_MATRIX_KERNELS +#include "context.hpp" +#include "operatorBase.hpp" +#include "ActivationFncs.hpp" + +namespace uTensor { +DECLARE_ERROR(InvalidMatrixMultIndicesError); + +// Assume c is already allocated to the correct size +// Naive implementation +template +void matrix_mult_kernel(Tensor& c, const Tensor& a, const Tensor& b) { + // Decide on c shape + TensorShape a_shape = a->get_shape(); + TensorShape b_shape = b->get_shape(); + TensorShape c_shape = c->get_shape(); + if (a_shape.num_dims() > 2 || b_shape.num_dims() > 2 || + c_shape.num_dims() > 2 || a_shape[1] != b_shape[0] || + a_shape[0] != c_shape[0] || b_shape[1] != c_shape[1]) { + uTensor_printf("[Error] Invalid matrix multiple shape mismatch\n"); + Context::get_default_context()->throwError( + new InvalidMatrixMultIndicesError); + } + + for (uint32_t i = 0; i < a_shape[0]; i++) { + for (uint32_t j = 0; j < b_shape[1]; j++) { + // c(i, j) = static_cast(0); + T tmp = 0; + for (uint32_t k = 0; k < a_shape[1]; k++) { + tmp += static_cast(a(i, k)) * static_cast(b(k, j)); + // printf("i, j, k : %d %d %d %d %d\n", i, j, k, static_cast(a(i, k)) + // , static_cast(b(k, j))); + } + c(i, j) = tmp; + } + } +} + +template +void matrix_mult_kernel_v2(Tensor& output, const Tensor& input, + const Tensor& filter, + Fuseable::Activation activation){ + const TensorShape& input_shape = input->get_shape(); + const TensorShape& filter_shape = filter->get_shape(); + TensorShape& output_shape = output->get_shape(); + + const int filter_dim_count = filter_shape.num_dims(); + const int batches = output_shape[0]; + const int output_depth = output_shape[1]; + if (!(output_depth <= filter_shape[filter_dim_count - 1])) { + Context::get_default_context()->throwError( + new InvalidMatrixMultIndicesError); + } + const int accum_depth = filter_shape[0]; + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + T acc = 0; + for (int d = 0; d < accum_depth; ++d) { + // TODO write this in tensor form + T input_val = static_cast(input(b, d, 0, 0)); + T filter_val = static_cast(filter(d, out_c, 0, 0)); + acc += filter_val * input_val; + } + acc = activation(acc); + output(b, out_c, 0, 0) = static_cast(acc); + } + } +} + +template +void matrix_mult_kernel_v2(Tensor& output, const Tensor& input, + const Tensor& filter, const Tensor& bias, + Fuseable::Activation activation){ + const TensorShape& input_shape = input->get_shape(); + const 
diff --git a/src/uTensor/ops/Matrix_kernels.cpp b/src/uTensor/ops/Matrix_kernels.cpp
new file mode 100644
index 00000000..a8bdc803
--- /dev/null
+++ b/src/uTensor/ops/Matrix_kernels.cpp
@@ -0,0 +1,6 @@
+#include "Matrix_kernels.hpp"
+
+namespace uTensor {
+
+DEFINE_ERROR(InvalidMatrixMultIndicesError);
+}
diff --git a/src/uTensor/ops/Matrix_kernels.hpp b/src/uTensor/ops/Matrix_kernels.hpp
new file mode 100644
index 00000000..96ff2f4f
--- /dev/null
+++ b/src/uTensor/ops/Matrix_kernels.hpp
@@ -0,0 +1,104 @@
+#ifndef UTENSOR_MATRIX_KERNELS
+#define UTENSOR_MATRIX_KERNELS
+#include "context.hpp"
+#include "operatorBase.hpp"
+#include "ActivationFncs.hpp"
+
+namespace uTensor {
+DECLARE_ERROR(InvalidMatrixMultIndicesError);
+
+// Assume c is already allocated to the correct size
+// Naive implementation
+template <typename T>
+void matrix_mult_kernel(Tensor& c, const Tensor& a, const Tensor& b) {
+  // Decide on c shape
+  TensorShape a_shape = a->get_shape();
+  TensorShape b_shape = b->get_shape();
+  TensorShape c_shape = c->get_shape();
+  if (a_shape.num_dims() > 2 || b_shape.num_dims() > 2 ||
+      c_shape.num_dims() > 2 || a_shape[1] != b_shape[0] ||
+      a_shape[0] != c_shape[0] || b_shape[1] != c_shape[1]) {
+    uTensor_printf("[Error] Invalid matrix multiply shape mismatch\n");
+    Context::get_default_context()->throwError(
+        new InvalidMatrixMultIndicesError);
+  }
+
+  for (uint32_t i = 0; i < a_shape[0]; i++) {
+    for (uint32_t j = 0; j < b_shape[1]; j++) {
+      T tmp = 0;
+      for (uint32_t k = 0; k < a_shape[1]; k++) {
+        tmp += static_cast<T>(a(i, k)) * static_cast<T>(b(k, j));
+      }
+      c(i, j) = tmp;
+    }
+  }
+}
+
+template <typename T>
+void matrix_mult_kernel_v2(Tensor& output, const Tensor& input,
+                           const Tensor& filter,
+                           Fuseable::Activation<T> activation) {
+  const TensorShape& input_shape = input->get_shape();
+  const TensorShape& filter_shape = filter->get_shape();
+  TensorShape& output_shape = output->get_shape();
+
+  const int filter_dim_count = filter_shape.num_dims();
+  const int batches = output_shape[0];
+  const int output_depth = output_shape[1];
+  if (!(output_depth <= filter_shape[filter_dim_count - 1])) {
+    Context::get_default_context()->throwError(
+        new InvalidMatrixMultIndicesError);
+  }
+  const int accum_depth = filter_shape[0];
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      T acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        // TODO write this in tensor form
+        T input_val = static_cast<T>(input(b, d, 0, 0));
+        T filter_val = static_cast<T>(filter(d, out_c, 0, 0));
+        acc += filter_val * input_val;
+      }
+      acc = activation(acc);
+      output(b, out_c, 0, 0) = static_cast<T>(acc);
+    }
+  }
+}
+
+template <typename T>
+void matrix_mult_kernel_v2(Tensor& output, const Tensor& input,
+                           const Tensor& filter, const Tensor& bias,
+                           Fuseable::Activation<T> activation) {
+  const TensorShape& input_shape = input->get_shape();
+  const TensorShape& filter_shape = filter->get_shape();
+  TensorShape& output_shape = output->get_shape();
+
+  const int filter_dim_count = filter_shape.num_dims();
+  const int batches = output_shape[0];
+  const int output_depth = output_shape[1];
+  if (!(output_depth <= filter_shape[filter_dim_count - 1])) {
+    Context::get_default_context()->throwError(
+        new InvalidMatrixMultIndicesError);
+  }
+  const int accum_depth = filter_shape[0];
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      T acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        // TODO write this in tensor form
+        T input_val = static_cast<T>(input(b, d, 0, 0));
+        T filter_val = static_cast<T>(filter(d, out_c, 0, 0));
+        acc += filter_val * input_val;
+      }
+      acc += static_cast<T>(bias(out_c));
+      acc = activation(acc);
+      output(b, out_c, 0, 0) = static_cast<T>(acc);
+    }
+  }
+}
+
+}  // namespace uTensor
+#endif
diff --git a/src/uTensor/tensors/RomTensor.cpp b/src/uTensor/tensors/RomTensor.cpp
index 2b1872ed..f2ef03eb 100644
--- a/src/uTensor/tensors/RomTensor.cpp
+++ b/src/uTensor/tensors/RomTensor.cpp
@@ -3,6 +3,7 @@
 #include <cstdio>
 
 #include "context.hpp"
+#include "uTensor_util.hpp"
 
 namespace uTensor {
 // EVENTS
@@ -12,7 +13,7 @@ RomTensor::RomTensor(TensorShape _shape, ttype _type, const void* buffer)
 
 // TODO Need to fix the write/read selection functions in Handle
 void* RomTensor::write(uint32_t linear_index) {
-  // printf("[ERROR] Attempted write to ROM tensor, make sure it's declared
+  // uTensor_printf("[ERROR] Attempted write to ROM tensor, make sure it's declared
   // const\n"); return nullptr;
   return BufferTensor::write(linear_index);
 }
@@ -29,17 +30,43 @@ size_t RomTensor::_get_writeable_block(void*& buffer, uint16_t req_write_size,
                                        uint32_t linear_index) {
   Context::get_default_context()->throwError(
       new InvalidOptimizableTensorError());
-  printf(
+  uTensor_printf(
       "ERROR, Optimized op attempted to write access non-optimizable tensor\n");
   return -1;
 }
 
 RomTensor::~RomTensor() {}
 void RomTensor::resize(TensorShape new_shape) {
-  printf("[ERROR] Attempted resize of ROM tensor\n");
+  uTensor_printf("[ERROR] Attempted resize of ROM tensor\n");
   Context::get_default_context()->throwError(new InvalidResizeError());
 }
 
+ScalarRomTensor::ScalarRomTensor(TensorShape _shape, ttype _type,
+                                 const void* buffer)
+    : RomTensor(_shape, _type, buffer) {
+  if (_shape.num_dims() != 1) {
+    uTensor_printf(
+        "[ERROR] Attempted to create scalar Tensor with more than one "
+        "dimension\n");
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError());
+  }
+  if (_shape[0] != 1) {
+    uTensor_printf("[ERROR] Scalar Tensor size not 1\n");
+    Context::get_default_context()->throwError(
+        new InvalidTensorDimensionsError());
+  }
+}
+
+ScalarRomTensor::~ScalarRomTensor() {}
+void* ScalarRomTensor::read(uint32_t linear_index) const {
+  return RomTensor::read(0);
+}
+// HACK TODO, REMOVE THIS after getting Handles to work with const pointers
+void* ScalarRomTensor::write(uint32_t linear_index) {
+  return RomTensor::write(0);
+}
+
 // Returns floor of square root of x
 int floorPerfSqrt(int x) {
   // Base cases
@@ -66,7 +93,7 @@ DiagonalRomTensor::DiagonalRomTensor(TensorShape _shape, ttype _type,
                                      const void* buffer, size_t buffer_len)
     : RomTensor(_shape, _type, buffer) {
   if (_shape.num_dims() != 2) {
-    printf(
+    uTensor_printf(
         "[ERROR] Attempted to create diagonal Tensor with wrong number of "
         "dimensions\n");
     Context::get_default_context()->throwError(
@@ -74,7 +101,7 @@ DiagonalRomTensor::DiagonalRomTensor(TensorShape _shape, ttype _type,
  }
   uint16_t smaller_dim = (_shape[0] < _shape[1]) ? _shape[0] : _shape[1];
   if (buffer_len < smaller_dim) {
-    printf("[ERROR] Diagnoal Tensor size mismatch with buffer\n");
+    uTensor_printf("[ERROR] Diagonal Tensor size mismatch with buffer\n");
     Context::get_default_context()->throwError(
         new InvalidTensorDimensionsError());
   }
diff --git a/src/uTensor/tensors/RomTensor.hpp b/src/uTensor/tensors/RomTensor.hpp
index a8733309..30b87869 100644
--- a/src/uTensor/tensors/RomTensor.hpp
+++ b/src/uTensor/tensors/RomTensor.hpp
@@ -34,6 +34,16 @@ class RomTensor : public BufferTensor {
                                      uint32_t linear_index) override;
 };
 
+class ScalarRomTensor : public RomTensor {
+ protected:
+  virtual void* read(uint32_t linear_index) const override;
+  virtual void* write(uint32_t linear_index) override;
+
+ public:
+  ScalarRomTensor(TensorShape _shape, ttype _type, const void* buffer);
+  virtual ~ScalarRomTensor();
+};
+
 class DiagonalRomTensor : public RomTensor {
  protected:
   virtual void* read(uint32_t linear_index) const override;