modules/dnn/include/opencv2/dnn/dnn.hpp (89 additions, 0 deletions)
@@ -1659,6 +1659,95 @@ CV__DNN_INLINE_NS_BEGIN
float confThreshold = 0.5f, float nmsThreshold = 0.0f);
};

/** @brief This class represents the high-level API for YOLO object detection networks.
 *
 * YOLODetectionModel allows setting parameters for preprocessing the input image.
 * It creates a net from a file with trained weights and config,
 * sets the preprocessing input, runs a forward pass and returns the resulting detections.
 */
class CV_EXPORTS_W_SIMPLE YOLODetectionModel : public DetectionModel
{
public:
    /**
     * @brief Creates a YOLO detection model from a network represented in one of the supported formats.
     * The order of the @p model and @p config arguments does not matter.
     * @param[in] model Binary file containing the trained weights.
     * @param[in] config Text file containing the network configuration.
     */
CV_WRAP YOLODetectionModel(const String& model, const String& config);

    /**
     * @brief Creates a model from an ONNX graph.
     * @param[in] onnx Path to the ONNX graph.
     */
CV_WRAP YOLODetectionModel(const String& onnx);

CV_DEPRECATED_EXTERNAL // avoid using in C++ code (need to fix bindings first)
YOLODetectionModel();

    /**
     * @brief Given the @p input frame, creates the input blob, runs the net and returns the resulting detections.
     * @param[in] frame The input image.
     * @param[out] classIds Class indexes of the resulting detections.
     * @param[out] confidences A set of corresponding confidences.
     * @param[out] boxes A set of bounding boxes.
     * @param[in] confThreshold A threshold used to filter boxes by confidence.
     * @param[in] nmsThreshold A threshold used in non-maximum suppression.
     */
CV_WRAP void detect(InputArray frame, CV_OUT std::vector<int>& classIds,
                        CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
                        float confThreshold = 0.5f, float nmsThreshold = 0.0f);

Contributor:
We should not have this here.
The user should use DetectionModel::detect() from the base class instead. Inheritance is designed for that.

Contributor (author):
The type of the box rectangle differs from the detect method of the base detection model.

Contributor:
How should the user and the bindings work with that?
Inheritance is not just a word.
Why doesn't DetectionModel need that method overload?

Contributor (author):
I have just checked the bindings. A call to the detect method in Python returns floating-point boxes as expected, so I do not understand what the issue is.
I have overridden the detect method since I changed the box type from Rect to Rect2d.
Do you want me to keep it as in the ancestor class? If yes, why?

Member:
Is there any reason for object detection to use Rect2d? All subsequent actions with a detected object (ROI extraction, drawing) require conversion to Rect2i anyway.

Contributor (author):
In that regard you are right, there is no point. I was coming from the point of view that whatever the base model outputs, OpenCV should return as well. Do you suggest changing it to Rect?

Member:
We apply some postprocessing anyway, so the output is not the raw network output. So yes, an integer type is enough for such a high-level API, and I cannot imagine a case where float/double precision is required.

Contributor:
Why do we still have this here?
The user should use DetectionModel::detect() from the base class instead.

Contributor (author):
I overload detect in the Impl class. Without a definition in the base class it fails to build. Could you check on your side?

    /**
     * @brief Static method for post-processing raw network detections.
     *
     * This method decodes and scales the detected bounding boxes and
     * filters them using confidence thresholding and non-maximum suppression.
     *
     * @param[in] detections Raw output blobs from the network.
     * @param[out] boxes Processed bounding boxes after decoding and NMS.
     * @param[out] confidences Confidence scores for the processed boxes.
     * @param[out] classIds Class IDs for each processed box.
     * @param[in] inputImgSize Size of the network input image.
     * @param[in] confThreshold Threshold for filtering boxes by confidence.
     * @param[in] nmsThreshold Threshold for non-maximum suppression.
     * @param[in] nmsAcrossClasses Whether NMS is applied across all classes rather than per class.
     */
CV_WRAP static void postProccess(
std::vector<Mat>& detections,
CV_OUT std::vector<Rect>& boxes,
CV_OUT std::vector<float>& confidences,
CV_OUT std::vector<int>& classIds,
Size inputImgSize,
const float confThreshold = 0.5f,
const float nmsThreshold = 0.4f,
const bool nmsAcrossClasses = true
);
/**
* @brief Set the padding mode used in image preprocessing.
*
* This method sets the padding mode which determines how the input images are padded
* and resized before being fed into the network.
*
* @param[in] paddingMode The padding mode to use.
* @return Reference to the current object for chaining calls.
*/
CV_WRAP YOLODetectionModel& setPaddingMode(const ImagePaddingMode paddingMode);
/**
* @brief Set the padding value used in image preprocessing.
*
* This method sets the value used to pad the input images during preprocessing.
*
* @param[in] paddingValue The padding value to use.
* @return Reference to the current object for chaining calls.
*/
CV_WRAP YOLODetectionModel& setPaddingValue(const float paddingValue);
};
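
For context, a minimal usage sketch of the proposed API, assuming the class builds as declared above. The weights file, input size, and thresholds are illustrative placeholders, not values taken from this PR:

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    // Hypothetical ONNX export; any supported YOLO model file could be used.
    cv::dnn::YOLODetectionModel model("yolov8n.onnx");

    // The padding setters return *this, so the calls chain.
    model.setPaddingMode(cv::dnn::DNN_PMODE_LETTERBOX)
         .setPaddingValue(114.0f);

    // Inherited from Model: scale factor, input size, mean and channel swap.
    model.setInputParams(1.0 / 255.0, cv::Size(640, 640), cv::Scalar(), /*swapRB=*/true);

    cv::Mat frame = cv::imread("image.jpg");
    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<cv::Rect> boxes;
    model.detect(frame, classIds, confidences, boxes,
                 /*confThreshold=*/0.5f, /*nmsThreshold=*/0.4f);
    return 0;
}
```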

/** @brief This class represents high-level API for text recognition networks.
*
modules/dnn/src/model.cpp (259 additions, 0 deletions)
@@ -637,6 +637,265 @@ void DetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
CV_Error(Error::StsNotImplemented, "Unknown output layer type: \"" + lastLayer->type + "\"");
}

class YOLODetectionModel_Impl : public DetectionModel_Impl
{

ImagePaddingMode paddingMode;
float padValue;
Size2f frameSize;

public:
    YOLODetectionModel_Impl()
    {
        // nothing to initialize
    }

void processFrame(InputArray frame, OutputArrayOfArrays outs)
{
CV_TRACE_FUNCTION();
if (size.empty())
CV_Error(Error::StsBadSize, "Input size not specified");

frameSize.width = frame.cols();
frameSize.height = frame.rows();

Image2BlobParams param;
param.scalefactor = scale;
param.size = size;
param.mean = mean;
param.swapRB = swapRB;
param.borderValue = padValue;
param.paddingmode = paddingMode;

Mat blob = dnn::blobFromImageWithParams(frame, param);
net.setInput(blob);

net.forward(outs, outNames);
}

    void setPaddingValue(const float padValue_){
        padValue = padValue_;
    }

Member:
const can be avoided.

void setPaddingMode(const ImagePaddingMode paddingmode_){
if (paddingmode_ == ImagePaddingMode::DNN_PMODE_NULL ||
paddingmode_ == ImagePaddingMode::DNN_PMODE_CROP_CENTER ||
paddingmode_ == ImagePaddingMode::DNN_PMODE_LETTERBOX){
paddingMode = paddingmode_;
} else {
CV_Error(Error::StsNotImplemented, "Unsupported padding mode");
}
}
Member:
Code style issue.


void setInputParams(double scale_, const Size& size_, const Scalar& mean_,
bool swapRB_, ImagePaddingMode paddingMode_, float padValue_)
{
size = size_;
mean = mean_;
scale = Scalar::all(scale_);
swapRB = swapRB_;
paddingMode = paddingMode_;
padValue = padValue_;
}

void detect(InputArray frame, CV_OUT std::vector<int>& classIds,
CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
float confThreshold, float nmsThreshold){

std::vector<Mat> detections;
processFrame(frame, detections);

YOLODetectionModel::postProccess(
detections,
boxes,
confidences,
classIds,
size,
confThreshold,
nmsThreshold,
getNmsAcrossClasses()
);

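// Map boxes from blob (padded/resized network input) coordinates back to the
// original frame using the same preprocessing parameters.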
Image2BlobParams paramNet;
paramNet.scalefactor = scale;
paramNet.size = size;
paramNet.mean = mean;
paramNet.swapRB = swapRB;
paramNet.paddingmode = paddingMode;
paramNet.blobRectsToImageRects(boxes, boxes, frameSize);
}
};

YOLODetectionModel::YOLODetectionModel(const String& model, const String& config)
{
impl = makePtr<YOLODetectionModel_Impl>();
impl->initNet(readNet(model, config));
Member:
So let's use readNetFromDarknet.
}

YOLODetectionModel::YOLODetectionModel(const String& onnx)
{
impl = makePtr<YOLODetectionModel_Impl>();
impl->initNet(readNetFromONNX(onnx));
}

YOLODetectionModel::YOLODetectionModel()
{
impl = std::static_pointer_cast<Model::Impl>(makePtr<YOLODetectionModel_Impl>());
}

void YOLODetectionModel::detect(InputArray frame, CV_OUT std::vector<int>& classIds,
CV_OUT std::vector<float>& confidences, CV_OUT std::vector<Rect>& boxes,
float confThreshold, float nmsThreshold) {
impl.dynamicCast<YOLODetectionModel_Impl>()->detect(frame, classIds,
confidences, boxes, confThreshold, nmsThreshold);
}

void YOLODetectionModel::postProccess(
std::vector<Mat>& detections,
Member (suggested change):
-    std::vector<Mat>& detections,
+    const std::vector<Mat>& detections,
CV_OUT std::vector<Rect>& keep_boxes,
CV_OUT std::vector<float>& keep_confidences,
CV_OUT std::vector<int>& keep_classIds,
Size inputImgSize,
const float confThreshold,
const float nmsThreshold,
const bool nmsAcrossClasses
){

bool yolov8 = false;
bool darknet = false;

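// Darknet region layers produce 2-D outputs of shape [N, 5 + numClasses];
// ONNX-exported YOLO models produce 3-D outputs.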
if (detections[0].dims == 2){
darknet = true;
}

if (!darknet && detections[0].size[1] < detections[0].size[2]) {
yolov8 = true; // Set the correct flag based on tensor shape
}
Member (@dkurt, Dec 22, 2023):
The darknet flag can be removed completely, considering https://github.com/opencv/opencv/pull/24691/files#r1434763871 and the following change:

    if (detections[0].dims == 3 && detections[0].size[1] < detections[0].size[2]) {
        yolov8 = true;  // Set the correct flag based on tensor shape
    }


// Collect candidate detections before NMS
std::vector<int> classIds;
std::vector<float> confidences;
std::vector<Rect> boxes;

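// YOLOv8-style heads emit [1, C, N] (e.g. [1, 84, 8400]); transpose to
// [1, N, C] so that each row is one candidate detection.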
if (yolov8){
for(auto & detection : detections){
cv::transposeND(detection, {0, 2, 1}, detection);
}
}
Member:
Move this to the loop below.

Contributor (author):
Done.


// each row is [cx, cy, w, h, conf_obj, conf_class1, ..., conf_class80]
for (auto preds : detections)
{
if (!darknet)
preds = preds.reshape(1, preds.size[1]);
Member:
This step is valid for every model, so can we remove the if condition?

Contributor (author):
It is not valid for yolov5 and larger models.

Member:
Can you please share the shapes for them?


for (int i = 0; i < preds.rows; ++i)
{
// filter out non objects
float obj_conf = (!yolov8) ? preds.at<float>(i, 4) : 1.0f;
if (obj_conf < confThreshold)
continue;

Mat scores = preds.row(i).colRange((!yolov8) ? 5 : 4, preds.cols);
double conf;
Point maxLoc;
minMaxLoc(scores, 0, &conf, 0, &maxLoc);

conf = (!yolov8) ? conf * obj_conf : conf;
if (conf < confThreshold)
continue;

// get bbox coords
float* det = preds.ptr<float>(i);
double cx = det[0];
double cy = det[1];
double w = det[2];
double h = det[3];

// [x1, y1, x2, y2]
double x1 = cx - 0.5 * w;
double y1 = cy - 0.5 * h;
double x2 = cx + 0.5 * w;
double y2 = cy + 0.5 * h;


int width = x2 - x1 + 1;
int height = y2 - y1 + 1;

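// Heuristic: if the decoded box collapses to a couple of pixels, the network
// most likely emitted coordinates normalized to [0, 1]; rescale to the input size.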
if (width <= 2 || height <= 2)
{
x1 = x1 * inputImgSize.width;
y1 = y1 * inputImgSize.height;
x2 = x2 * inputImgSize.width;
y2 = y2 * inputImgSize.height;
width = x2 - x1 + 1;
height = y2 - y1 + 1;
}

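// Note: the Rect fields temporarily hold [x1, y1, x2, y2]; width/height are
// rewritten to true w/h after NMS (see the conversion at the end).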
boxes.emplace_back(Rect(x1, y1, x2, y2));
classIds.emplace_back(maxLoc.x);
confidences.emplace_back(conf);
}
}

// NMS
if (nmsAcrossClasses)
{
std::vector<int> keep_idx;
NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, keep_idx);
for (auto i : keep_idx)
{
keep_classIds.emplace_back(classIds[i]);
keep_confidences.emplace_back(confidences[i]);
keep_boxes.emplace_back(boxes[i]);
}
}
else
{
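// Per-class NMS: group detection indices by class id and suppress within each group.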
std::map<int, std::vector<size_t> > class2indices;
for (size_t i = 0; i < classIds.size(); i++)
{
if (confidences[i] >= confThreshold)
{
class2indices[classIds[i]].push_back(i);
}
}
for (const auto& it : class2indices)
{
std::vector<Rect> localBoxes;
std::vector<float> localConfidences;
for (size_t idx : it.second)
{
localBoxes.push_back(boxes[idx]);
localConfidences.push_back(confidences[idx]);
}
std::vector<int> indices;
NMSBoxes(localBoxes, localConfidences, confThreshold, nmsThreshold, indices);

keep_classIds.resize(keep_classIds.size() + indices.size(), it.first);
for (int idx : indices)
{
keep_boxes.push_back(localBoxes[idx]);
keep_confidences.push_back(localConfidences[idx]);
}
}
}
    // convert boxes from the temporary [x1, y1, x2, y2] encoding back to [x, y, width, height]
    for (auto& box : keep_boxes){
        box.width = box.width - box.x;
        box.height = box.height - box.y;
    }
}
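
Similarly, a sketch of how the static postProccess entry point might be driven with raw forward-pass outputs; the model file and the 640x640 input size are assumptions. Note that the boxes it returns stay in blob (network-input) coordinates, since the mapping back to frame coordinates via blobRectsToImageRects happens only inside detect():

```cpp
#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>

int main()
{
    cv::Mat frame = cv::imread("image.jpg");                      // placeholder input
    cv::dnn::Net net = cv::dnn::readNetFromONNX("yolov5s.onnx");  // placeholder model

    // Preprocess exactly as the network expects (assumed 640x640, RGB, [0, 1]).
    cv::Mat blob = cv::dnn::blobFromImage(frame, 1.0 / 255.0, cv::Size(640, 640),
                                          cv::Scalar(), /*swapRB=*/true);
    net.setInput(blob);

    std::vector<cv::Mat> outs;
    net.forward(outs, net.getUnconnectedOutLayersNames());

    std::vector<cv::Rect> boxes;
    std::vector<float> confidences;
    std::vector<int> classIds;
    cv::dnn::YOLODetectionModel::postProccess(outs, boxes, confidences, classIds,
                                              cv::Size(640, 640), 0.5f, 0.4f,
                                              /*nmsAcrossClasses=*/true);
    return 0;
}
```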

YOLODetectionModel& YOLODetectionModel::setPaddingMode(const ImagePaddingMode paddingMode){
impl.dynamicCast<YOLODetectionModel_Impl>()->setPaddingMode(paddingMode);
return *this;
}

YOLODetectionModel& YOLODetectionModel::setPaddingValue(const float paddingValue){
    impl.dynamicCast<YOLODetectionModel_Impl>()->setPaddingValue(paddingValue);
    return *this;
}

struct TextRecognitionModel_Impl : public Model::Impl
{
std::string decodeType;