Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions modules/dnn/src/cuda4dnn/primitives/matmul_broadcast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include "../csl/tensor.hpp"
#include "../csl/tensor_ops.hpp"

#include "../kernels/eltwise_ops.hpp" // for adding bias

#include <opencv2/core.hpp>

#include <utility>
Expand All @@ -23,7 +25,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
public:
using wrapper_type = GetCUDABackendWrapperType<T>;

MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, bool _transA, bool _transB,
MatMulBroadcastOp(csl::Stream stream_, csl::cublas::Handle handle, const Mat &B, const Mat &bias, bool _transA, bool _transB,
const std::vector<size_t> &A_offsets_, const std::vector<size_t> &B_offsets_, std::vector<size_t> &C_offsets_,
size_t batch_)
: stream(std::move(stream_)), cublasHandle(std::move(handle)), A_offsets(A_offsets_), B_offsets(B_offsets_), C_offsets(C_offsets_), batch(batch_)
Expand All @@ -33,6 +35,11 @@ namespace cv { namespace dnn { namespace cuda4dnn {
csl::copyMatToTensor<T>(B, input_B_tensor, stream);
}

if (!bias.empty()) {
bias_tensor = csl::makeTensorHeader<T>(bias);
csl::copyMatToTensor<T>(bias, bias_tensor, stream);
}

transA = _transA;
transB = _transB;
}
Expand All @@ -42,9 +49,6 @@ namespace cv { namespace dnn { namespace cuda4dnn {
const std::vector<cv::Ptr<BackendWrapper>>& outputs,
csl::Workspace& workspace) override
{
CV_Assert(((inputs.size() == 2 && input_B_tensor.empty()) ||
(inputs.size() == 1 && !input_B_tensor.empty())) && outputs.size() == 1);

auto input_A_wrapper = inputs[0].dynamicCast<wrapper_type>();
auto input_A = input_A_wrapper->getView();

Expand All @@ -60,12 +64,26 @@ namespace cv { namespace dnn { namespace cuda4dnn {
auto output = output_wrapper->getSpan();

csl::tensor_ops::gemmBatched<T>(cublasHandle, batch, 0.f, output, C_offsets, 1.f, transA, input_A, A_offsets, transB, input_B, B_offsets);

// add bias if exists
if (!bias_tensor.empty() || inputs.size() >= 3) {
csl::TensorView<T> bias;
if (bias_tensor.empty()) {
auto bias_wrapper = inputs[2].dynamicCast<wrapper_type>();
bias = bias_wrapper->getView();
} else {
bias = csl::TensorView<T>(bias_tensor);
}

kernels::eltwise_sum_2<T>(stream, output, output, bias);
}
}

private:
csl::Stream stream;
csl::cublas::Handle cublasHandle;
csl::Tensor<T> input_B_tensor;
csl::Tensor<T> bias_tensor;
bool transA, transB;

std::vector<size_t> A_offsets;
Expand Down
Loading