Correct aspect ratio after frame to input resizing

Nuzhny007 · Dec 10, 2024 · bd7ae44
1 parent f5d869b
commit bd7ae44
Show file tree

Hide file tree

Showing 13 changed files with 62 additions and 66 deletions.
diff --git a/src/Detector/tensorrt_yolo/YoloONNX.cpp b/src/Detector/tensorrt_yolo/YoloONNX.cpp
@@ -326,8 +326,9 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector<cv::Mat>& sampleImages)
         }
     }
 
-#if 0
+    m_resizedROI = cv::Rect(0, 0, inputW, inputH);
 
+#if 1
     // resize the DsImage with scale
     const float imgHeight = static_cast<float>(sampleImages[0].rows);
     const float imgWidth = static_cast<float>(sampleImages[0].cols);
@@ -351,7 +352,7 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector<cv::Mat>& sampleImages)
     assert(2 * yOffset + resizeH == inputH);
 
     cv::Size scaleSize(inputW, inputH);
-    cv::Rect roiRect(xOffset, yOffset, resizeW, resizeH);
+    m_resizedROI = cv::Rect(xOffset, yOffset, resizeW, resizeH);
 
     if (m_resizedBatch.size() < sampleImages.size())
         m_resizedBatch.resize(sampleImages.size());
@@ -361,7 +362,7 @@ bool YoloONNX::ProcessInputAspectRatio(const std::vector<cv::Mat>& sampleImages)
     {
         if (m_resizedBatch[b].size() != scaleSize)
             m_resizedBatch[b] = cv::Mat(scaleSize, sampleImages[b].type(), cv::Scalar::all(128));
-        cv::resize(sampleImages[b], cv::Mat(m_resizedBatch[b], roiRect), roiRect.size(), 0, 0, cv::INTER_LINEAR);
+        cv::resize(sampleImages[b], cv::Mat(m_resizedBatch[b], m_resizedROI), m_resizedROI.size(), 0, 0, cv::INTER_LINEAR);
         cv::split(m_resizedBatch[b], m_inputChannels[b]);
         std::swap(m_inputChannels[b][0], m_inputChannels[b][2]);
     }

diff --git a/src/Detector/tensorrt_yolo/YoloONNX.hpp b/src/Detector/tensorrt_yolo/YoloONNX.hpp
@@ -79,9 +79,10 @@ class YoloONNX
     size_t GetNumClasses() const;
 
 protected:
-    SampleYoloParams m_params; //!< The parameters for the sample.
-    nvinfer1::Dims m_inputDims; //!< The dimensions of the input to the network.
-    std::vector<nvinfer1::Dims> m_outpuDims; //!< The dimensions of the input to the network.
+    SampleYoloParams m_params;               //!< The parameters for the sample
+    nvinfer1::Dims m_inputDims;              //!< The dimensions of the input to the network
+    std::vector<nvinfer1::Dims> m_outpuDims; //!< The dimensions of the outputs of the network
+    cv::Rect m_resizedROI;                   //!< Region of the network input that receives the frame resized with its aspect ratio preserved
 
     virtual std::vector<tensor_rt::Result> GetResult(size_t imgIdx, int keep_topk, const std::vector<float*>& outputs, cv::Size frameSize) = 0;
 

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv10_bb.hpp
@@ -20,8 +20,8 @@ class YOLOv10_bb_onnx : public YoloONNX
 		//0: name: images, size: 1x3x640x640
 		//1: name: output0, size: 1x300x6
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -51,8 +51,8 @@ class YOLOv10_bb_onnx : public YoloONNX
 			//if (i == 0)
 			//	std::cout << i << ": " << output[k + 0] << " " << output[k + 1] << " " << output[k + 2] << " " << output[k + 3] << " " << output[k + 4] << " " << output[k + 5] << std::endl;
 
-			float x = fw * output[k + 0];
-			float y = fh * output[k + 1];
+			float x = fw * (output[k + 0] - m_resizedROI.x);
+			float y = fh * (output[k + 1] - m_resizedROI.y);
 			float width = fw * (output[k + 2] - output[k + 0]);
 			float height = fh * (output[k + 3] - output[k + 1]);
 			float objectConf = output[k + 4];

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_bb.hpp
@@ -20,8 +20,8 @@ class YOLOv11_bb_onnx : public YoloONNX
 		//0: name: images, size: 1x3x640x640
 		//1: name: output0, size: 1x84x8400
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -88,8 +88,8 @@ class YOLOv11_bb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height));

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_instance.hpp
@@ -17,8 +17,8 @@ class YOLOv11_instance_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		size_t outInd = (outputs.size() == 0) ? 1 : 0;
 		size_t segInd = (outputs.size() == 0) ? 0 : 1;
@@ -155,8 +155,8 @@ class YOLOv11_instance_onnx : public YoloONNX
 			if (objectConf >= m_params.confThreshold)
 			{
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv11_obb.hpp
@@ -22,8 +22,8 @@ class YOLOv11_obb_onnx : public YoloONNX
 		//20: 15 DOTA classes + x + y + w + h + a
 		constexpr int shapeDataSize = 5;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -96,8 +96,8 @@ class YOLOv11_obb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height)
-				float cx = fw * output[k];
-				float cy = fh * output[k + 1];
+				float cx = fw * (output[k] - m_resizedROI.x);
+				float cy = fh * (output[k + 1] - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI;

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv6_bb.hpp
@@ -17,6 +17,9 @@ class YOLOv6_bb_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
+
 		if (outputs.size() == 4)
 		{
 			auto dets = reinterpret_cast<int*>(outputs[0]);
@@ -26,9 +29,6 @@ class YOLOv6_bb_onnx : public YoloONNX
 
 			int objectsCount = m_outpuDims[1].d[1];
 
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			//std::cout << "Dets[" << imgIdx << "] = " << dets[imgIdx] << ", objectsCount = " << objectsCount << std::endl;
 
 			const size_t step1 = imgIdx * objectsCount;
@@ -41,8 +41,8 @@ class YOLOv6_bb_onnx : public YoloONNX
 				int classId = classes[i + step1];
 				if (class_conf >= m_params.confThreshold)
 				{
-					float x = fw * boxes[k + 0 + step2];
-					float y = fh * boxes[k + 1 + step2];
+					float x = fw * (boxes[k + 0 + step2] - m_resizedROI.x);
+					float y = fh * (boxes[k + 1 + step2] - m_resizedROI.y);
-					float width = fw * boxes[k + 2 + step2] - x;
-					float height = fh * boxes[k + 3 + step2] - y;
+					float width = fw * (boxes[k + 2 + step2] - boxes[k + 0 + step2]);   // x2 - x1: x now has the ROI offset removed, so "fw * x2 - x" would add fw * m_resizedROI.x to the width
+					float height = fh * (boxes[k + 3 + step2] - boxes[k + 1 + step2]);  // y2 - y1: same reasoning for the vertical ROI offset
 
@@ -57,9 +57,6 @@ class YOLOv6_bb_onnx : public YoloONNX
 		}
 		else if (outputs.size() == 1)
 		{
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			auto output = outputs[0];
 
 			size_t ncInd = 2;
@@ -96,8 +93,8 @@ class YOLOv6_bb_onnx : public YoloONNX
 					int classId = cvRound(output[k + 5]);
 					if (class_conf >= m_params.confThreshold)
 					{
-						float x = fw * output[k + 1];
-						float y = fh * output[k + 2];
+						float x = fw * (output[k + 1] - m_resizedROI.x);
+						float y = fh * (output[k + 2] - m_resizedROI.y);
 						float width = fw * (output[k + 3] - output[k + 1]);
 						float height = fh * (output[k + 4] - output[k + 2]);
 
@@ -150,8 +147,8 @@ class YOLOv6_bb_onnx : public YoloONNX
 					if (object_conf >= m_params.confThreshold)
 					{
 						// (center x, center y, width, height) to (x, y, w, h)
-						float x = fw * (output[k] - output[k + 2] / 2);
-						float y = fh * (output[k + 1] - output[k + 3] / 2);
+						float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+						float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 						float width = fw * output[k + 2];
 						float height = fh * output[k + 3];
 

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv7_bb.hpp
@@ -17,6 +17,9 @@ class YOLOv7_bb_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
+
 		if (outputs.size() == 4)
 		{
 			auto dets = reinterpret_cast<int*>(outputs[0]);
@@ -26,9 +29,6 @@ class YOLOv7_bb_onnx : public YoloONNX
 
 			int objectsCount = m_outpuDims[1].d[1];
 
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			//std::cout << "Dets[" << imgIdx << "] = " << dets[imgIdx] << ", objectsCount = " << objectsCount << std::endl;
 
 			const size_t step1 = imgIdx * objectsCount;
@@ -41,8 +41,8 @@ class YOLOv7_bb_onnx : public YoloONNX
 				int classId = classes[i + step1];
 				if (class_conf >= m_params.confThreshold)
 				{
-					float x = fw * boxes[k + 0 + step2];
-					float y = fh * boxes[k + 1 + step2];
+					float x = fw * (boxes[k + 0 + step2] - m_resizedROI.x);
+					float y = fh * (boxes[k + 1 + step2] - m_resizedROI.y);
-					float width = fw * boxes[k + 2 + step2] - x;
-					float height = fh * boxes[k + 3 + step2] - y;
+					float width = fw * (boxes[k + 2 + step2] - boxes[k + 0 + step2]);   // x2 - x1: x now has the ROI offset removed, so "fw * x2 - x" would add fw * m_resizedROI.x to the width
+					float height = fh * (boxes[k + 3 + step2] - boxes[k + 1 + step2]);  // y2 - y1: same reasoning for the vertical ROI offset
 
@@ -57,9 +57,6 @@ class YOLOv7_bb_onnx : public YoloONNX
 		}
 		else if (outputs.size() == 1)
 		{
-			const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-			const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
-
 			auto output = outputs[0];
 
 			size_t ncInd = 2;
@@ -96,8 +93,8 @@ class YOLOv7_bb_onnx : public YoloONNX
 					int classId = cvRound(output[k + 5]);
 					if (class_conf >= m_params.confThreshold)
 					{
-						float x = fw * output[k + 1];
-						float y = fh * output[k + 2];
+						float x = fw * (output[k + 1] - m_resizedROI.x);
+						float y = fh * (output[k + 2] - m_resizedROI.y);
 						float width = fw * (output[k + 3] - output[k + 1]);
 						float height = fh * (output[k + 4] - output[k + 2]);
 
@@ -150,8 +147,8 @@ class YOLOv7_bb_onnx : public YoloONNX
 					if (object_conf >= m_params.confThreshold)
 					{
 						// (center x, center y, width, height) to (x, y, w, h)
-						float x = fw * (output[k] - output[k + 2] / 2);
-						float y = fh * (output[k + 1] - output[k + 3] / 2);
+						float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+						float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 						float width = fw * output[k + 2];
 						float height = fh * output[k + 3];
 

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv7_instance.hpp
@@ -18,8 +18,8 @@ class YOLOv7_instance_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		size_t outInd = (outputs.size() == 0) ? 0 : 1;
 		size_t segInd = (outputs.size() == 0) ? 1 : 0;
@@ -123,8 +123,8 @@ class YOLOv7_instance_onnx : public YoloONNX
 			if (object_conf >= m_params.confThreshold)
 			{
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_bb.hpp
@@ -20,8 +20,8 @@ class YOLOv8_bb_onnx : public YoloONNX
 		//0: name: images, size: 1x3x640x640
 		//1: name: output0, size: 1x84x8400
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -88,8 +88,8 @@ class YOLOv8_bb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height));

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_instance.hpp
@@ -17,8 +17,8 @@ class YOLOv8_instance_onnx : public YoloONNX
 	{
 		std::vector<tensor_rt::Result> resBoxes;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		size_t outInd = (outputs.size() == 0) ? 0 : 1;
 		size_t segInd = (outputs.size() == 0) ? 1 : 0;
@@ -155,8 +155,8 @@ class YOLOv8_instance_onnx : public YoloONNX
 			if (objectConf >= m_params.confThreshold)
 			{
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv8_obb.hpp
@@ -22,8 +22,8 @@ class YOLOv8_obb_onnx : public YoloONNX
 		//20: 15 DOTA classes + x + y + w + h + a
 		constexpr int shapeDataSize = 5;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -96,8 +96,8 @@ class YOLOv8_obb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height)
-				float cx = fw * output[k];
-				float cy = fh * output[k + 1];
+				float cx = fw * (output[k] - m_resizedROI.x);
+				float cy = fh * (output[k + 1] - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				float angle = 180.f * output[k + nc + shapeDataSize - 1] / M_PI;

diff --git a/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp b/src/Detector/tensorrt_yolo/YoloONNXv9_bb.hpp
@@ -22,8 +22,8 @@ class YOLOv9_bb_onnx : public YoloONNX
 		//84: 80 COCO classes + x + y + w + h
 		constexpr int shapeDataSize = 4;
 
-		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_inputDims.d[3]);
-		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_inputDims.d[2]);
+		const float fw = static_cast<float>(frameSize.width) / static_cast<float>(m_resizedROI.width);
+		const float fh = static_cast<float>(frameSize.height) / static_cast<float>(m_resizedROI.height);
 
 		auto output = outputs[0];
 
@@ -90,8 +90,8 @@ class YOLOv9_bb_onnx : public YoloONNX
 				confidences.push_back(objectConf);
 
 				// (center x, center y, width, height) to (x, y, w, h)
-				float x = fw * (output[k] - output[k + 2] / 2);
-				float y = fh * (output[k + 1] - output[k + 3] / 2);
+				float x = fw * (output[k] - output[k + 2] / 2 - m_resizedROI.x);
+				float y = fh * (output[k + 1] - output[k + 3] / 2 - m_resizedROI.y);
 				float width = fw * output[k + 2];
 				float height = fh * output[k + 3];
 				rectBoxes.emplace_back(cvRound(x), cvRound(y), cvRound(width), cvRound(height));