diff --git a/examples/inference/yolox_demo.py b/examples/inference/yolox_demo.py
new file mode 100644
index 0000000..4f58a41
--- /dev/null
+++ b/examples/inference/yolox_demo.py
@@ -0,0 +1,73 @@
+import cv2
+import os.path
+
+from what.models.detection.datasets.coco import COCO_CLASS_NAMES
+from what.models.detection.utils.box_utils import draw_bounding_boxes
+
+from what.models.detection.yolox.yolox_x import YOLOX_X
+from what.models.detection.yolox.yolox_l import YOLOX_L
+from what.models.detection.yolox.yolox_m import YOLOX_M
+from what.models.detection.yolox.yolox_s import YOLOX_S
+
+from what.cli.model import *
+from what.utils.file import get_file
+
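+# The last four entries of what_model_list are the YOLOX X/L/M/S weights
+# registered in what/cli/model.py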
+what_yolox_model_list = what_model_list[9:13]
+
+video = input(f"Please input the OpenCV capture device (e.g. 0, 1, 2): ")
+
+while not video.isdigit():
+ video = input(f"Please input the OpenCV capture device (e.g. 0, 1, 2): ")
+
+# Capture from camera
+cap = cv2.VideoCapture(int(video))
+#cap.set(3, 1920)
+#cap.set(4, 1080)
+
+# Check what_model_list for all supported models
+index = 0
+
+# Download the model first if not exists
+WHAT_YOLOX_MODEL_FILE = what_yolox_model_list[index][WHAT_MODEL_FILE_INDEX]
+WHAT_YOLOX_MODEL_URL = what_yolox_model_list[index][WHAT_MODEL_URL_INDEX]
+WHAT_YOLOX_MODEL_HASH = what_yolox_model_list[index][WHAT_MODEL_HASH_INDEX]
+
+if not os.path.isfile(os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE)):
+ get_file(WHAT_YOLOX_MODEL_FILE,
+ WHAT_MODEL_PATH,
+ WHAT_YOLOX_MODEL_URL,
+ WHAT_YOLOX_MODEL_HASH)
+
+if index == 0:
+ model = YOLOX_X(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+if index == 1:
+ model = YOLOX_L(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+if index == 2:
+ model = YOLOX_M(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+if index == 3:
+ model = YOLOX_S(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+while True:
+ _, orig_image = cap.read()
+ if orig_image is None:
+ continue
+
+ # Image preprocessing
+ image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
+
+ # Run inference
+ images, boxes, labels, probs = model.predict(image)
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+    # Draw bounding boxes onto the image
+    if len(boxes) > 0:
+        image = draw_bounding_boxes(image, boxes, labels, model.class_names, probs)
+
+    cv2.imshow('YOLOX', image)
+
+ if cv2.waitKey(1) & 0xFF == ord('q'):
+ break
+
+cap.release()
+cv2.destroyAllWindows()
diff --git a/pyproject.toml b/pyproject.toml
index d8a7485..26c2a88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,7 @@ requires = [
"matplotlib",
"click",
"progressbar",
+ "loguru"
]
build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
index 0c0fc4e..8518ab7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ tensorflow
matplotlib
pandas
progressbar
+loguru
diff --git a/setup.py b/setup.py
index 1e38d99..480bd94 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ def get_version(rel_path):
"pandas",
"click",
"progressbar",
+ "loguru",
]
setuptools.setup(
diff --git a/what/cli/example.py b/what/cli/example.py
index 3af8cf6..d274f34 100644
--- a/what/cli/example.py
+++ b/what/cli/example.py
@@ -1,6 +1,7 @@
# Model Inference
from what.examples.yolov3_demo import yolov3_inference_demo
from what.examples.yolov4_demo import yolov4_inference_demo
+from what.examples.yolox_demo import yolox_inference_demo
from what.examples.faster_rcnn_demo import frcnn_inference_demo
from what.examples.mobilenet_ssd_demo import mobilenet_ssd_inference_demo
@@ -16,6 +17,7 @@
what_example_list = [
(' Yolov3 Demo ', ' Model Inference ', 'Yolov3 Object Detection.', yolov3_inference_demo),
(' Yolov4 Demo ', ' Model Inference ', 'Yolov4 Object Detection.', yolov4_inference_demo),
+ (' YoloX Demo ', ' Model Inference ', 'YoloX Object Detection.', yolox_inference_demo),
(' FasterRCNN Demo ', ' Model Inference ', 'FRCNN Object Detection.', frcnn_inference_demo),
('MobileNet SSD Demo', ' Model Inference ', 'MobileNet SSD Object Detection.', mobilenet_ssd_inference_demo),
(' TOG Attack Demo ', 'Adversarial Attack', 'Real-time TOG Attack against Yolov3 Tiny.', yolov3_pcb_attack_demo),
diff --git a/what/cli/model.py b/what/cli/model.py
index fdf0c0d..d971702 100644
--- a/what/cli/model.py
+++ b/what/cli/model.py
@@ -19,5 +19,9 @@
('YOLOv4 Tiny ( Darknet )', 'Object Detection', 'YOLOv4 Tiny pretrained on MS COCO dataset.', 'yolov4-tiny.h5', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/yolov4-tiny.h5', '867f54dced382170538a9ca2374e14e778f80d4abd6011652b911b6aca77384e'),
('SSD ( MobileNet v1 )', 'Object Detection', 'SSD pretrained on VOC-2012 dataset.', 'mobilenet-v1-ssd-mp-0_675.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/mobilenet-v1-ssd-mp-0_675.pth', '58694cafa60456eeab4e81ae50ff49a01c46ab387bfea5200f047143ecd973a9'),
('SSD ( MobileNet v2 )', 'Object Detection', 'SSD pretrained on VOC-2012 dataset.', 'mobilenet-v2-ssd-lite-mp-0_686.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/mobilenet-v2-ssd-lite-mp-0_686.pth', 'b0d1ac2cdbf3c241ba837f51eeebc565ea37b95b7258e2604506a2f991e398a4'),
- ('FasterRCNN ( VGG16 )', 'Object Detection', 'Faster-RCNN pretrained on VOC-2012 dataset.', 'fasterrcnn_12211511_0.701052458187_torchvision_pretrain.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/fasterrcnn_12211511_0.701052458187_torchvision_pretrain.pth', '3fd279284b536da3eac754404779e32e2e9fdd82d8511bbc7f6c50e14f0c69d2')
+ ('FasterRCNN ( VGG16 )', 'Object Detection', 'Faster-RCNN pretrained on VOC-2012 dataset.', 'fasterrcnn_12211511_0.701052458187_torchvision_pretrain.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/fasterrcnn_12211511_0.701052458187_torchvision_pretrain.pth', '3fd279284b536da3eac754404779e32e2e9fdd82d8511bbc7f6c50e14f0c69d2'),
+ ('YOLOX X-Large ', 'Object Detection', 'YOLOX-X pretrained on MS COCO dataset.', 'yolox-x.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/yolox_x.pth', '5652330b6ae860043f091b8f550a60c10e1129f416edfdb65c259be6caf355cf'),
+ ('YOLOX Large ', 'Object Detection', 'YOLOX-L pretrained on MS COCO dataset.', 'yolox-l.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/yolox_l.pth', '1e6b7fa6240375370b2a8a8eab9066b3cdd43fd1d0bfa8d2027fd3a51def2917'),
+ ('YOLOX Medium ', 'Object Detection', 'YOLOX-M pretrained on MS COCO dataset.', 'yolox-m.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/yolox_m.pth', '60076992b32da82951c90cfa7bd6ab70eba9eda243e08b940a396f60ac2d19b6'),
+ ('YOLOX Small ', 'Object Detection', 'YOLOX-S pretrained on MS COCO dataset.', 'yolox-s.pth', 'https://wuhanstudio.nyc3.cdn.digitaloceanspaces.com/what/yolox_s.pth', 'f55ded7181e1b0c13285c56e7790b8f0e8f8db590fe4edb37f0b7f345c913a30'),
]
diff --git a/what/examples/faster_rcnn_demo.py b/what/examples/faster_rcnn_demo.py
index 8508dfc..da532f8 100644
--- a/what/examples/faster_rcnn_demo.py
+++ b/what/examples/faster_rcnn_demo.py
@@ -69,11 +69,12 @@ def frcnn_inference_demo():
boxes[:, 2] = box_w / width
boxes[:, 3] = box_h / height
- output = draw_bounding_boxes(orig_image,
- boxes,
- labels[0],
- VOC_CLASS_NAMES[1:],
- scores[0])
+        output = orig_image
+        if len(boxes) > 0:
+            output = draw_bounding_boxes(orig_image,
+                                         boxes,
+                                         labels[0],
+                                         VOC_CLASS_NAMES[1:],
+                                         scores[0])
cv2.imshow('Faster RCNN Demo', output)
diff --git a/what/examples/mobilenet_ssd_demo.py b/what/examples/mobilenet_ssd_demo.py
index 7a0238d..608f462 100644
--- a/what/examples/mobilenet_ssd_demo.py
+++ b/what/examples/mobilenet_ssd_demo.py
@@ -78,7 +78,8 @@ def mobilenet_ssd_inference_demo():
# Draw bounding boxes onto the image
height, width, _ = image.shape
- output = draw_bounding_boxes(image, boxes, labels, model.class_names, probs);
+        output = image
+        if len(boxes) > 0:
+            output = draw_bounding_boxes(image, boxes, labels, model.class_names, probs)
cv2.imshow('MobileNet SSD Demo', output)
diff --git a/what/examples/yolov3_demo.py b/what/examples/yolov3_demo.py
index d878342..cfe7a53 100644
--- a/what/examples/yolov3_demo.py
+++ b/what/examples/yolov3_demo.py
@@ -74,7 +74,8 @@ def yolov3_inference_demo():
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
# Draw bounding boxes onto the image
- output = draw_bounding_boxes(image, boxes, labels, model.class_names, probs);
+ if len(boxes) > 0:
+ output = draw_bounding_boxes(image, boxes, labels, model.class_names, probs);
cv2.imshow('YOLOv3 Demo', image)
diff --git a/what/examples/yolov3_pcb_attack_demo.py b/what/examples/yolov3_pcb_attack_demo.py
index 4bc68d5..77c0b31 100644
--- a/what/examples/yolov3_pcb_attack_demo.py
+++ b/what/examples/yolov3_pcb_attack_demo.py
@@ -111,7 +111,8 @@ def yolov3_pcb_attack_demo():
# logger.info(f"{classes[labels[i]]}: {probs[i]:.2f}")
out_img = cv2.cvtColor(out_img, cv2.COLOR_RGB2BGR)
- out_img = draw_bounding_boxes(out_img, boxes, labels, classes, probs);
+ if len(boxes) > 0:
+ out_img = draw_bounding_boxes(out_img, boxes, labels, classes, probs);
cv2.namedWindow("result", cv2.WINDOW_NORMAL)
cv2.imshow("result", out_img)
diff --git a/what/examples/yolov3_tog_attack_demo.py b/what/examples/yolov3_tog_attack_demo.py
index a60d46a..4f1e29e 100644
--- a/what/examples/yolov3_tog_attack_demo.py
+++ b/what/examples/yolov3_tog_attack_demo.py
@@ -108,7 +108,8 @@ def yolov3_tog_attack_demo():
out_img = (out_img * 255.0).astype(np.uint8)
out_img = cv2.cvtColor(out_img, cv2.COLOR_RGB2BGR)
- out_img = draw_bounding_boxes(out_img, boxes, labels, classes, probs);
+ if len(boxes) > 0:
+ out_img = draw_bounding_boxes(out_img, boxes, labels, classes, probs);
cv2.namedWindow("result", cv2.WINDOW_NORMAL)
cv2.imshow("result", out_img)
diff --git a/what/examples/yolov4_demo.py b/what/examples/yolov4_demo.py
index 20ba4d1..cd00c15 100644
--- a/what/examples/yolov4_demo.py
+++ b/what/examples/yolov4_demo.py
@@ -74,7 +74,8 @@ def yolov4_inference_demo():
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
# Draw bounding boxes onto the image
- output = draw_bounding_boxes(image, boxes, labels, model.class_names, probs);
+ if len(boxes) > 0:
+ output = draw_bounding_boxes(image, boxes, labels, model.class_names, probs);
cv2.imshow('YOLOv4 Demo', image)
diff --git a/what/examples/yolox_demo.py b/what/examples/yolox_demo.py
new file mode 100644
index 0000000..ea4be2f
--- /dev/null
+++ b/what/examples/yolox_demo.py
@@ -0,0 +1,101 @@
+import cv2
+import os.path
+
+from what.models.detection.datasets.coco import COCO_CLASS_NAMES
+from what.models.detection.utils.box_utils import draw_bounding_boxes
+
+from what.models.detection.yolox.yolox_x import YOLOX_X
+from what.models.detection.yolox.yolox_l import YOLOX_L
+from what.models.detection.yolox.yolox_m import YOLOX_M
+from what.models.detection.yolox.yolox_s import YOLOX_S
+
+from what.cli.model import *
+
+from what.utils.file import get_file
+
+what_yolox_model_list = what_model_list[9:13]
+
+def yolox_inference_demo():
+
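+    # List the available YOLOX weights and mark the ones already downloaded with [x]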
+ max_len = max([len(x[WHAT_MODEL_NAME_INDEX]) for x in what_yolox_model_list])
+ for i, model in enumerate(what_yolox_model_list, start=1):
+ if os.path.isfile(os.path.join(WHAT_MODEL_PATH, model[WHAT_MODEL_FILE_INDEX])):
+ downloaded = 'x'
+ else:
+ downloaded = ' '
+ print('[{}] {} : {:<{w}s}\t{}\t{}'.format(downloaded, i, model[WHAT_MODEL_NAME_INDEX], model[WHAT_MODEL_TYPE_INDEX], model[WHAT_MODEL_DESC_INDEX], w=max_len))
+
+ index = input(f"Please input the model index: ")
+    while not index.isdigit() or int(index) < 1 or int(index) > len(what_yolox_model_list):
+ index = input(f"Model [{index}] does not exist. Please try again: ")
+
+ index = int(index) - 1
+
+ # Download the model first if not exists
+ WHAT_YOLOX_MODEL_FILE = what_yolox_model_list[index][WHAT_MODEL_FILE_INDEX]
+ WHAT_YOLOX_MODEL_URL = what_yolox_model_list[index][WHAT_MODEL_URL_INDEX]
+ WHAT_YOLOX_MODEL_HASH = what_yolox_model_list[index][WHAT_MODEL_HASH_INDEX]
+
+ if not os.path.isfile(os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE)):
+ get_file(WHAT_YOLOX_MODEL_FILE,
+ WHAT_MODEL_PATH,
+ WHAT_YOLOX_MODEL_URL,
+ WHAT_YOLOX_MODEL_HASH)
+
+ if index == 0:
+ model = YOLOX_X(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+ if index == 1:
+ model = YOLOX_L(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+ if index == 2:
+ model = YOLOX_M(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+ if index == 3:
+ model = YOLOX_S(COCO_CLASS_NAMES, os.path.join(WHAT_MODEL_PATH, WHAT_YOLOX_MODEL_FILE))
+
+    video = input("Please input the OpenCV capture device or video file (e.g. 0, 1, video.mp4): ")
+
+    while not video.isdigit() and not os.path.isfile(video):
+        video = input("Please input the OpenCV capture device or video file (e.g. 0, 1, video.mp4): ")
+
+    try:
+        # Capture from camera or a video file
+ if video.isdigit():
+ cap = cv2.VideoCapture(int(video))
+ else:
+ cap = cv2.VideoCapture(video)
+
+ #cap.set(3, 1920)
+ #cap.set(4, 1080)
+
+ while True:
+ _, orig_image = cap.read()
+ if orig_image is None:
+ continue
+
+ # Image preprocessing
+ image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
+
+ # Run inference
+ images, boxes, labels, probs = model.predict(image)
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+ # Draw bounding boxes onto the image
+ if len(boxes) > 0:
+                image = draw_bounding_boxes(image, boxes, labels, model.class_names, probs)
+
+ cv2.imshow('YOLOX Demo', image)
+
+ if cv2.waitKey(1) & 0xFF == ord('q'):
+ break
+
+ cap.release()
+ cv2.destroyAllWindows()
+
+ except Exception as e:
+ print(e)
+
+
+if __name__ == "__main__":
+ yolox_inference_demo()
diff --git a/what/models/detection/__init__.py b/what/models/detection/__init__.py
index 6e3c0a9..81142cd 100644
--- a/what/models/detection/__init__.py
+++ b/what/models/detection/__init__.py
@@ -9,3 +9,4 @@
from what.models.detection import yolo
from what.models.detection import ssd
from what.models.detection import frcnn
+from what.models.detection import yolox
diff --git a/what/models/detection/utils/box_utils.py b/what/models/detection/utils/box_utils.py
index ec829e2..2a1c8cb 100644
--- a/what/models/detection/utils/box_utils.py
+++ b/what/models/detection/utils/box_utils.py
@@ -7,36 +7,37 @@ def draw_bounding_boxes(image, boxes, labels, class_names, probs):
assert(boxes.shape[1] == 4)
boxes = to_numpy(boxes)
- # (x, y, w, h) --> (x1, y1, x2, y2)
- height, width, _ = image.shape
- for box in boxes:
- box[0] *= width
- box[1] *= height
- box[2] *= width
- box[3] *= height
-
- # From center to top left
- box[0] -= box[2] / 2
- box[1] -= box[3] / 2
-
- # From width and height to x2 and y2
- box[2] += box[0]
- box[3] += box[1]
-
- # Draw bounding boxes and labels
- for i in range(boxes.shape[0]):
- box = boxes[i]
- label = f"{class_names[labels[i]]}: {probs[i]:.2f}"
- # print(label)
-
- # Draw bounding boxes
- cv2.rectangle(image, (int(box[0].item()), int(box[1].item())), (int(box[2].item()), int(box[3].item())), (255, 255, 0), 4)
-
- # Draw labels
- cv2.putText(image, label,
- (int(box[0]+20), int(box[1]+40)),
- cv2.FONT_HERSHEY_SIMPLEX,
- 1, # font scale
- (255, 0, 255),
- 2) # line type
+ # (x, y, w, h) --> (x1, y1, x2, y2)
+ height, width, _ = image.shape
+ for box in boxes:
+ box[0] *= width
+ box[1] *= height
+ box[2] *= width
+ box[3] *= height
+
+ # From center to top left
+ box[0] -= box[2] / 2
+ box[1] -= box[3] / 2
+
+ # From width and height to x2 and y2
+ box[2] += box[0]
+ box[3] += box[1]
+
+ # Draw bounding boxes and labels
+ for i in range(boxes.shape[0]):
+ box = boxes[i]
+ label = f"{class_names[labels[i]]}: {probs[i]:.2f}"
+ # print(label)
+
+ # Draw bounding boxes
+ cv2.rectangle(image, (int(box[0].item()), int(box[1].item())), (int(box[2].item()), int(box[3].item())), (255, 255, 0), 4)
+
+ # Draw labels
+ cv2.putText(image, label,
+ (int(box[0]+20), int(box[1]+40)),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 1, # font scale
+ (255, 0, 255),
+ 2) # line type
+
return image
diff --git a/what/models/detection/yolox/__init__.py b/what/models/detection/yolox/__init__.py
new file mode 100644
index 0000000..3006b2b
--- /dev/null
+++ b/what/models/detection/yolox/__init__.py
@@ -0,0 +1,18 @@
+r'''
+This module implements the YOLOX object detection model.
+
+
+
+## what.models.detection.yolox.yolox_x
+## what.models.detection.yolox.yolox_m
+## what.models.detection.yolox.yolox_l
+## what.models.detection.yolox.yolox_s
+
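+A minimal usage sketch (the weight path below is only an example; the demos
+download the weights into WHAT_MODEL_PATH via what.utils.file.get_file):
+
+    from what.models.detection.datasets.coco import COCO_CLASS_NAMES
+    from what.models.detection.yolox import YOLOX_S
+
+    # image: an RGB numpy array of shape (H, W, 3)
+    model = YOLOX_S(COCO_CLASS_NAMES, "yolox-s.pth")
+    images, boxes, labels, probs = model.predict(image)
+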
+'''
+
+from what.models.detection.yolox.yolox_x import YOLOX_X
+from what.models.detection.yolox.yolox_m import YOLOX_M
+from what.models.detection.yolox.yolox_l import YOLOX_L
+from what.models.detection.yolox.yolox_s import YOLOX_S
+
+__all__ = ["YOLOX_X", "YOLOX_M", "YOLOX_L", "YOLOX_S"]
diff --git a/what/models/detection/yolox/core/__init__.py b/what/models/detection/yolox/core/__init__.py
new file mode 100644
index 0000000..c2379c7
--- /dev/null
+++ b/what/models/detection/yolox/core/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+from .launch import launch
+from .trainer import Trainer
diff --git a/what/models/detection/yolox/core/launch.py b/what/models/detection/yolox/core/launch.py
new file mode 100644
index 0000000..9f8eec6
--- /dev/null
+++ b/what/models/detection/yolox/core/launch.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Code are based on
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import sys
+from datetime import timedelta
+from loguru import logger
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+import what.models.detection.yolox.utils.dist as comm  # vendored copy of yolox.utils.dist
+
+__all__ = ["launch"]
+
+
+DEFAULT_TIMEOUT = timedelta(minutes=30)
+
+
+def _find_free_port():
+ """
+ Find an available port of current machine / node.
+ """
+ import socket
+
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ # Binding to port 0 will cause the OS to find an available port for us
+ sock.bind(("", 0))
+ port = sock.getsockname()[1]
+ sock.close()
+ # NOTE: there is still a chance the port could be taken by other processes.
+ return port
+
+
+def launch(
+ main_func,
+ num_gpus_per_machine,
+ num_machines=1,
+ machine_rank=0,
+ backend="nccl",
+ dist_url=None,
+ args=(),
+ timeout=DEFAULT_TIMEOUT,
+):
+ """
+ Args:
+ main_func: a function that will be called by `main_func(*args)`
+ num_machines (int): the total number of machines
+ machine_rank (int): the rank of this machine (one per machine)
+ dist_url (str): url to connect to for distributed training, including protocol
+ e.g. "tcp://127.0.0.1:8686".
+ Can be set to auto to automatically select a free port on localhost
+ args (tuple): arguments passed to main_func
+ """
+ world_size = num_machines * num_gpus_per_machine
+ if world_size > 1:
+ # https://github.com/pytorch/pytorch/pull/14391
+ # TODO prctl in spawned processes
+
+ if dist_url == "auto":
+ assert (
+ num_machines == 1
+ ), "dist_url=auto cannot work with distributed training."
+ port = _find_free_port()
+ dist_url = f"tcp://127.0.0.1:{port}"
+
+ start_method = "spawn"
+ cache = vars(args[1]).get("cache", False)
+
+ # To use numpy memmap for caching image into RAM, we have to use fork method
+ if cache:
+ assert sys.platform != "win32", (
+ "As Windows platform doesn't support fork method, "
+ "do not add --cache in your training command."
+ )
+ start_method = "fork"
+
+ mp.start_processes(
+ _distributed_worker,
+ nprocs=num_gpus_per_machine,
+ args=(
+ main_func,
+ world_size,
+ num_gpus_per_machine,
+ machine_rank,
+ backend,
+ dist_url,
+ args,
+ ),
+ daemon=False,
+ start_method=start_method,
+ )
+ else:
+ main_func(*args)
+
+
+def _distributed_worker(
+ local_rank,
+ main_func,
+ world_size,
+ num_gpus_per_machine,
+ machine_rank,
+ backend,
+ dist_url,
+ args,
+ timeout=DEFAULT_TIMEOUT,
+):
+ assert (
+ torch.cuda.is_available()
+ ), "cuda is not available. Please check your installation."
+ global_rank = machine_rank * num_gpus_per_machine + local_rank
+ logger.info("Rank {} initialization finished.".format(global_rank))
+ try:
+ dist.init_process_group(
+ backend=backend,
+ init_method=dist_url,
+ world_size=world_size,
+ rank=global_rank,
+ timeout=timeout,
+ )
+ except Exception:
+ logger.error("Process group URL: {}".format(dist_url))
+ raise
+
+ # Setup the local process group (which contains ranks within the same machine)
+ assert comm._LOCAL_PROCESS_GROUP is None
+ num_machines = world_size // num_gpus_per_machine
+ for i in range(num_machines):
+ ranks_on_i = list(
+ range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)
+ )
+ pg = dist.new_group(ranks_on_i)
+ if i == machine_rank:
+ comm._LOCAL_PROCESS_GROUP = pg
+
+ # synchronize is needed here to prevent a possible timeout after calling init_process_group
+ # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
+ comm.synchronize()
+
+ assert num_gpus_per_machine <= torch.cuda.device_count()
+ torch.cuda.set_device(local_rank)
+
+ main_func(*args)
diff --git a/what/models/detection/yolox/core/trainer.py b/what/models/detection/yolox/core/trainer.py
new file mode 100644
index 0000000..a764426
--- /dev/null
+++ b/what/models/detection/yolox/core/trainer.py
@@ -0,0 +1,390 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import datetime
+import os
+import time
+from loguru import logger
+
+import torch
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+
+from what.models.detection.yolox.data import DataPrefetcher
+from what.models.detection.yolox.exp import Exp
+from what.models.detection.yolox.utils import (
+ MeterBuffer,
+ ModelEMA,
+ WandbLogger,
+ adjust_status,
+ all_reduce_norm,
+ get_local_rank,
+ get_model_info,
+ get_rank,
+ get_world_size,
+ gpu_mem_usage,
+ is_parallel,
+ load_ckpt,
+ mem_usage,
+ occupy_mem,
+ save_checkpoint,
+ setup_logger,
+ synchronize
+)
+
+
+class Trainer:
+ def __init__(self, exp: Exp, args):
+ # init function only defines some basic attr, other attrs like model, optimizer are built in
+ # before_train methods.
+ self.exp = exp
+ self.args = args
+
+ # training related attr
+ self.max_epoch = exp.max_epoch
+ self.amp_training = args.fp16
+ self.scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)
+ self.is_distributed = get_world_size() > 1
+ self.rank = get_rank()
+ self.local_rank = get_local_rank()
+ self.device = "cuda:{}".format(self.local_rank)
+ self.use_model_ema = exp.ema
+ self.save_history_ckpt = exp.save_history_ckpt
+
+ # data/dataloader related attr
+ self.data_type = torch.float16 if args.fp16 else torch.float32
+ self.input_size = exp.input_size
+ self.best_ap = 0
+
+ # metric record
+ self.meter = MeterBuffer(window_size=exp.print_interval)
+ self.file_name = os.path.join(exp.output_dir, args.experiment_name)
+
+ if self.rank == 0:
+ os.makedirs(self.file_name, exist_ok=True)
+
+ setup_logger(
+ self.file_name,
+ distributed_rank=self.rank,
+ filename="train_log.txt",
+ mode="a",
+ )
+
+ def train(self):
+ self.before_train()
+ try:
+ self.train_in_epoch()
+ except Exception:
+ raise
+ finally:
+ self.after_train()
+
+ def train_in_epoch(self):
+ for self.epoch in range(self.start_epoch, self.max_epoch):
+ self.before_epoch()
+ self.train_in_iter()
+ self.after_epoch()
+
+ def train_in_iter(self):
+ for self.iter in range(self.max_iter):
+ self.before_iter()
+ self.train_one_iter()
+ self.after_iter()
+
+ def train_one_iter(self):
+ iter_start_time = time.time()
+
+ inps, targets = self.prefetcher.next()
+ inps = inps.to(self.data_type)
+ targets = targets.to(self.data_type)
+ targets.requires_grad = False
+ inps, targets = self.exp.preprocess(inps, targets, self.input_size)
+ data_end_time = time.time()
+
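+        # Forward and backward pass under (optional) mixed precision;
+        # GradScaler rescales the loss so fp16 gradients do not underflow.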
+ with torch.cuda.amp.autocast(enabled=self.amp_training):
+ outputs = self.model(inps, targets)
+
+ loss = outputs["total_loss"]
+
+ self.optimizer.zero_grad()
+ self.scaler.scale(loss).backward()
+ self.scaler.step(self.optimizer)
+ self.scaler.update()
+
+ if self.use_model_ema:
+ self.ema_model.update(self.model)
+
+ lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1)
+ for param_group in self.optimizer.param_groups:
+ param_group["lr"] = lr
+
+ iter_end_time = time.time()
+ self.meter.update(
+ iter_time=iter_end_time - iter_start_time,
+ data_time=data_end_time - iter_start_time,
+ lr=lr,
+ **outputs,
+ )
+
+ def before_train(self):
+ logger.info("args: {}".format(self.args))
+ logger.info("exp value:\n{}".format(self.exp))
+
+ # model related init
+ torch.cuda.set_device(self.local_rank)
+ model = self.exp.get_model()
+ logger.info(
+ "Model Summary: {}".format(get_model_info(model, self.exp.test_size))
+ )
+ model.to(self.device)
+
+ # solver related init
+ self.optimizer = self.exp.get_optimizer(self.args.batch_size)
+
+ # value of epoch will be set in `resume_train`
+ model = self.resume_train(model)
+
+ # data related init
+ self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs
+ self.train_loader = self.exp.get_data_loader(
+ batch_size=self.args.batch_size,
+ is_distributed=self.is_distributed,
+ no_aug=self.no_aug,
+ cache_img=self.args.cache,
+ )
+ logger.info("init prefetcher, this might take one minute or less...")
+ self.prefetcher = DataPrefetcher(self.train_loader)
+ # max_iter means iters per epoch
+ self.max_iter = len(self.train_loader)
+
+ self.lr_scheduler = self.exp.get_lr_scheduler(
+ self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter
+ )
+ if self.args.occupy:
+ occupy_mem(self.local_rank)
+
+ if self.is_distributed:
+ model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)
+
+ if self.use_model_ema:
+ self.ema_model = ModelEMA(model, 0.9998)
+ self.ema_model.updates = self.max_iter * self.start_epoch
+
+ self.model = model
+
+ self.evaluator = self.exp.get_evaluator(
+ batch_size=self.args.batch_size, is_distributed=self.is_distributed
+ )
+ # Tensorboard and Wandb loggers
+ if self.rank == 0:
+ if self.args.logger == "tensorboard":
+ self.tblogger = SummaryWriter(os.path.join(self.file_name, "tensorboard"))
+ elif self.args.logger == "wandb":
+ self.wandb_logger = WandbLogger.initialize_wandb_logger(
+ self.args,
+ self.exp,
+ self.evaluator.dataloader.dataset
+ )
+ else:
+ raise ValueError("logger must be either 'tensorboard' or 'wandb'")
+
+ logger.info("Training start...")
+ logger.info("\n{}".format(model))
+
+ def after_train(self):
+ logger.info(
+ "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100)
+ )
+ if self.rank == 0:
+ if self.args.logger == "wandb":
+ self.wandb_logger.finish()
+
+ def before_epoch(self):
+ logger.info("---> start train epoch{}".format(self.epoch + 1))
+
+ if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug:
+ logger.info("--->No mosaic aug now!")
+ self.train_loader.close_mosaic()
+ logger.info("--->Add additional L1 loss now!")
+ if self.is_distributed:
+ self.model.module.head.use_l1 = True
+ else:
+ self.model.head.use_l1 = True
+ self.exp.eval_interval = 1
+ if not self.no_aug:
+ self.save_ckpt(ckpt_name="last_mosaic_epoch")
+
+ def after_epoch(self):
+ self.save_ckpt(ckpt_name="latest")
+
+ if (self.epoch + 1) % self.exp.eval_interval == 0:
+ all_reduce_norm(self.model)
+ self.evaluate_and_save_model()
+
+ def before_iter(self):
+ pass
+
+ def after_iter(self):
+ """
+ `after_iter` contains two parts of logic:
+ * log information
+ * reset setting of resize
+ """
+ # log needed information
+ if (self.iter + 1) % self.exp.print_interval == 0:
+ # TODO check ETA logic
+ left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1)
+ eta_seconds = self.meter["iter_time"].global_avg * left_iters
+ eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds)))
+
+ progress_str = "epoch: {}/{}, iter: {}/{}".format(
+ self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter
+ )
+ loss_meter = self.meter.get_filtered_meter("loss")
+ loss_str = ", ".join(
+ ["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()]
+ )
+
+ time_meter = self.meter.get_filtered_meter("time")
+ time_str = ", ".join(
+ ["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]
+ )
+
+ mem_str = "gpu mem: {:.0f}Mb, mem: {:.1f}Gb".format(gpu_mem_usage(), mem_usage())
+
+ logger.info(
+ "{}, {}, {}, {}, lr: {:.3e}".format(
+ progress_str,
+ mem_str,
+ time_str,
+ loss_str,
+ self.meter["lr"].latest,
+ )
+ + (", size: {:d}, {}".format(self.input_size[0], eta_str))
+ )
+
+ if self.rank == 0:
+ if self.args.logger == "tensorboard":
+ self.tblogger.add_scalar(
+ "train/lr", self.meter["lr"].latest, self.progress_in_iter)
+ for k, v in loss_meter.items():
+ self.tblogger.add_scalar(
+ f"train/{k}", v.latest, self.progress_in_iter)
+ if self.args.logger == "wandb":
+ metrics = {"train/" + k: v.latest for k, v in loss_meter.items()}
+ metrics.update({
+ "train/lr": self.meter["lr"].latest
+ })
+ self.wandb_logger.log_metrics(metrics, step=self.progress_in_iter)
+
+ self.meter.clear_meters()
+
+ # random resizing
+ if (self.progress_in_iter + 1) % 10 == 0:
+ self.input_size = self.exp.random_resize(
+ self.train_loader, self.epoch, self.rank, self.is_distributed
+ )
+
+ @property
+ def progress_in_iter(self):
+ return self.epoch * self.max_iter + self.iter
+
+ def resume_train(self, model):
+ if self.args.resume:
+ logger.info("resume training")
+ if self.args.ckpt is None:
+ ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth")
+ else:
+ ckpt_file = self.args.ckpt
+
+ ckpt = torch.load(ckpt_file, map_location=self.device)
+ # resume the model/optimizer state dict
+ model.load_state_dict(ckpt["model"])
+ self.optimizer.load_state_dict(ckpt["optimizer"])
+ self.best_ap = ckpt.pop("best_ap", 0)
+ # resume the training states variables
+ start_epoch = (
+ self.args.start_epoch - 1
+ if self.args.start_epoch is not None
+ else ckpt["start_epoch"]
+ )
+ self.start_epoch = start_epoch
+ logger.info(
+ "loaded checkpoint '{}' (epoch {})".format(
+ self.args.resume, self.start_epoch
+ )
+ ) # noqa
+ else:
+ if self.args.ckpt is not None:
+ logger.info("loading checkpoint for fine tuning")
+ ckpt_file = self.args.ckpt
+ ckpt = torch.load(ckpt_file, map_location=self.device)["model"]
+ model = load_ckpt(model, ckpt)
+ self.start_epoch = 0
+
+ return model
+
+ def evaluate_and_save_model(self):
+ if self.use_model_ema:
+ evalmodel = self.ema_model.ema
+ else:
+ evalmodel = self.model
+ if is_parallel(evalmodel):
+ evalmodel = evalmodel.module
+
+ with adjust_status(evalmodel, training=False):
+ (ap50_95, ap50, summary), predictions = self.exp.eval(
+ evalmodel, self.evaluator, self.is_distributed, return_outputs=True
+ )
+
+ update_best_ckpt = ap50_95 > self.best_ap
+ self.best_ap = max(self.best_ap, ap50_95)
+
+ if self.rank == 0:
+ if self.args.logger == "tensorboard":
+ self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
+ self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1)
+ if self.args.logger == "wandb":
+ self.wandb_logger.log_metrics({
+ "val/COCOAP50": ap50,
+ "val/COCOAP50_95": ap50_95,
+ "train/epoch": self.epoch + 1,
+ })
+ self.wandb_logger.log_images(predictions)
+ logger.info("\n" + summary)
+ synchronize()
+
+ self.save_ckpt("last_epoch", update_best_ckpt, ap=ap50_95)
+ if self.save_history_ckpt:
+ self.save_ckpt(f"epoch_{self.epoch + 1}", ap=ap50_95)
+
+ def save_ckpt(self, ckpt_name, update_best_ckpt=False, ap=None):
+ if self.rank == 0:
+ save_model = self.ema_model.ema if self.use_model_ema else self.model
+ logger.info("Save weights to {}".format(self.file_name))
+ ckpt_state = {
+ "start_epoch": self.epoch + 1,
+ "model": save_model.state_dict(),
+ "optimizer": self.optimizer.state_dict(),
+ "best_ap": self.best_ap,
+ "curr_ap": ap,
+ }
+ save_checkpoint(
+ ckpt_state,
+ update_best_ckpt,
+ self.file_name,
+ ckpt_name,
+ )
+
+ if self.args.logger == "wandb":
+ self.wandb_logger.save_checkpoint(
+ self.file_name,
+ ckpt_name,
+ update_best_ckpt,
+ metadata={
+ "epoch": self.epoch + 1,
+ "optimizer": self.optimizer.state_dict(),
+ "best_ap": self.best_ap,
+ "curr_ap": ap
+ }
+ )
diff --git a/what/models/detection/yolox/data/__init__.py b/what/models/detection/yolox/data/__init__.py
new file mode 100644
index 0000000..aeaf4f9
--- /dev/null
+++ b/what/models/detection/yolox/data/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+from .data_augment import TrainTransform, ValTransform
+from .data_prefetcher import DataPrefetcher
+from .dataloading import DataLoader, get_yolox_datadir, worker_init_reset_seed
+from .datasets import *
+from .samplers import InfiniteSampler, YoloBatchSampler
diff --git a/what/models/detection/yolox/data/data_augment.py b/what/models/detection/yolox/data/data_augment.py
new file mode 100644
index 0000000..2dad96a
--- /dev/null
+++ b/what/models/detection/yolox/data/data_augment.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+"""
+Data augmentation functionality. Passed as callable transformations to
+Dataset classes.
+
+The data augmentation procedures were interpreted from @weiliu89's SSD paper
+http://arxiv.org/abs/1512.02325
+"""
+
+import math
+import random
+
+import cv2
+import numpy as np
+
+from what.models.detection.yolox.utils import xyxy2cxcywh
+
+
+def augment_hsv(img, hgain=5, sgain=30, vgain=30):
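+    # Jitter hue / saturation / value by random gains in HSV space;
+    # hue wraps modulo 180 (OpenCV hue range), S and V are clipped to [0, 255].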
+ hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains
+ hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v
+ hsv_augs = hsv_augs.astype(np.int16)
+ img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
+
+ img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
+ img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
+ img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
+
+ cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img) # no return needed
+
+
+def get_aug_params(value, center=0):
+ if isinstance(value, float):
+ return random.uniform(center - value, center + value)
+ elif len(value) == 2:
+ return random.uniform(value[0], value[1])
+ else:
+ raise ValueError(
+ "Affine params should be either a sequence containing two values\
+ or single float values. Got {}".format(value)
+ )
+
+
+def get_affine_matrix(
+ target_size,
+ degrees=10,
+ translate=0.1,
+ scales=0.1,
+ shear=10,
+):
+ twidth, theight = target_size
+
+ # Rotation and Scale
+ angle = get_aug_params(degrees)
+ scale = get_aug_params(scales, center=1.0)
+
+ if scale <= 0.0:
+ raise ValueError("Argument scale should be positive")
+
+ R = cv2.getRotationMatrix2D(angle=angle, center=(0, 0), scale=scale)
+
+ M = np.ones([2, 3])
+ # Shear
+ shear_x = math.tan(get_aug_params(shear) * math.pi / 180)
+ shear_y = math.tan(get_aug_params(shear) * math.pi / 180)
+
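+    # Fold the shear into the rotation/scale matrix: M = [[1, shear_y], [shear_x, 1]] @ R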
+ M[0] = R[0] + shear_y * R[1]
+ M[1] = R[1] + shear_x * R[0]
+
+ # Translation
+ translation_x = get_aug_params(translate) * twidth # x translation (pixels)
+ translation_y = get_aug_params(translate) * theight # y translation (pixels)
+
+ M[0, 2] = translation_x
+ M[1, 2] = translation_y
+
+ return M, scale
+
+
+def apply_affine_to_bboxes(targets, target_size, M, scale):
+ num_gts = len(targets)
+
+ # warp corner points
+ twidth, theight = target_size
+ corner_points = np.ones((4 * num_gts, 3))
+ corner_points[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
+ 4 * num_gts, 2
+ ) # x1y1, x2y2, x1y2, x2y1
+ corner_points = corner_points @ M.T # apply affine transform
+ corner_points = corner_points.reshape(num_gts, 8)
+
+ # create new boxes
+ corner_xs = corner_points[:, 0::2]
+ corner_ys = corner_points[:, 1::2]
+ new_bboxes = (
+ np.concatenate(
+ (corner_xs.min(1), corner_ys.min(1), corner_xs.max(1), corner_ys.max(1))
+ )
+ .reshape(4, num_gts)
+ .T
+ )
+
+ # clip boxes
+ new_bboxes[:, 0::2] = new_bboxes[:, 0::2].clip(0, twidth)
+ new_bboxes[:, 1::2] = new_bboxes[:, 1::2].clip(0, theight)
+
+ targets[:, :4] = new_bboxes
+
+ return targets
+
+
+def random_affine(
+ img,
+ targets=(),
+ target_size=(640, 640),
+ degrees=10,
+ translate=0.1,
+ scales=0.1,
+ shear=10,
+):
+ M, scale = get_affine_matrix(target_size, degrees, translate, scales, shear)
+
+ img = cv2.warpAffine(img, M, dsize=target_size, borderValue=(114, 114, 114))
+
+ # Transform label coordinates
+ if len(targets) > 0:
+ targets = apply_affine_to_bboxes(targets, target_size, M, scale)
+
+ return img, targets
+
+
+def _mirror(image, boxes, prob=0.5):
+ _, width, _ = image.shape
+ if random.random() < prob:
+ image = image[:, ::-1]
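+        # Mirror the boxes as well: boxes[:, 2::-2] reads (x2, x1), so the new
+        # x1 becomes width - x2 and the new x2 becomes width - x1, keeping x1 <= x2.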
+ boxes[:, 0::2] = width - boxes[:, 2::-2]
+ return image, boxes
+
+
+def preproc(img, input_size, swap=(2, 0, 1)):
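+    # Letterbox preprocessing: resize the image to fit inside input_size while
+    # keeping its aspect ratio, pad the rest with grey (114), and transpose to
+    # CHW. Returns the padded float32 image and the resize ratio r.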
+ if len(img.shape) == 3:
+ padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+ else:
+ padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+ r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
+ interpolation=cv2.INTER_LINEAR,
+ ).astype(np.uint8)
+ padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+ padded_img = padded_img.transpose(swap)
+ padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+ return padded_img, r
+
+
+class TrainTransform:
+ def __init__(self, max_labels=50, flip_prob=0.5, hsv_prob=1.0):
+ self.max_labels = max_labels
+ self.flip_prob = flip_prob
+ self.hsv_prob = hsv_prob
+
+ def __call__(self, image, targets, input_dim):
+ boxes = targets[:, :4].copy()
+ labels = targets[:, 4].copy()
+ if len(boxes) == 0:
+ targets = np.zeros((self.max_labels, 5), dtype=np.float32)
+ image, r_o = preproc(image, input_dim)
+ return image, targets
+
+ image_o = image.copy()
+ targets_o = targets.copy()
+ height_o, width_o, _ = image_o.shape
+ boxes_o = targets_o[:, :4]
+ labels_o = targets_o[:, 4]
+ # bbox_o: [xyxy] to [c_x,c_y,w,h]
+ boxes_o = xyxy2cxcywh(boxes_o)
+
+ if random.random() < self.hsv_prob:
+ augment_hsv(image)
+ image_t, boxes = _mirror(image, boxes, self.flip_prob)
+ height, width, _ = image_t.shape
+ image_t, r_ = preproc(image_t, input_dim)
+ # boxes [xyxy] 2 [cx,cy,w,h]
+ boxes = xyxy2cxcywh(boxes)
+ boxes *= r_
+
+ mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 1
+ boxes_t = boxes[mask_b]
+ labels_t = labels[mask_b]
+
+ if len(boxes_t) == 0:
+ image_t, r_o = preproc(image_o, input_dim)
+ boxes_o *= r_o
+ boxes_t = boxes_o
+ labels_t = labels_o
+
+ labels_t = np.expand_dims(labels_t, 1)
+
+ targets_t = np.hstack((labels_t, boxes_t))
+ padded_labels = np.zeros((self.max_labels, 5))
+ padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[
+ : self.max_labels
+ ]
+ padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32)
+ return image_t, padded_labels
+
+
+class ValTransform:
+ """
+ Defines the transformations that should be applied to test PIL image
+ for input into the network
+
+ dimension -> tensorize -> color adj
+
+ Arguments:
+ resize (int): input dimension to SSD
+ rgb_means ((int,int,int)): average RGB of the dataset
+ (104,117,123)
+ swap ((int,int,int)): final order of channels
+
+ Returns:
+ transform (transform) : callable transform to be applied to test/val
+ data
+ """
+
+ def __init__(self, swap=(2, 0, 1), legacy=False):
+ self.swap = swap
+ self.legacy = legacy
+
+ # assume input is cv2 img for now
+ def __call__(self, img, res, input_size):
+ img, _ = preproc(img, input_size, self.swap)
+ if self.legacy:
+ img = img[::-1, :, :].copy()
+ img /= 255.0
+ img -= np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
+ img /= np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
+ return img, np.zeros((1, 5))
diff --git a/what/models/detection/yolox/data/data_prefetcher.py b/what/models/detection/yolox/data/data_prefetcher.py
new file mode 100644
index 0000000..a118cf4
--- /dev/null
+++ b/what/models/detection/yolox/data/data_prefetcher.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import torch
+
+
+class DataPrefetcher:
+ """
+ DataPrefetcher is inspired by code of following file:
+ https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py
+ It could speedup your pytorch dataloader. For more information, please check
+ https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789.
+ """
+
+ def __init__(self, loader):
+ self.loader = iter(loader)
+ self.stream = torch.cuda.Stream()
+ self.input_cuda = self._input_cuda_for_image
+ self.record_stream = DataPrefetcher._record_stream_for_image
+ self.preload()
+
+ def preload(self):
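+        # Fetch the next batch and start its host-to-device copy on a side CUDA
+        # stream so the transfer overlaps with compute on the current stream.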
+ try:
+ self.next_input, self.next_target, _, _ = next(self.loader)
+ except StopIteration:
+ self.next_input = None
+ self.next_target = None
+ return
+
+ with torch.cuda.stream(self.stream):
+ self.input_cuda()
+ self.next_target = self.next_target.cuda(non_blocking=True)
+
+ def next(self):
+ torch.cuda.current_stream().wait_stream(self.stream)
+ input = self.next_input
+ target = self.next_target
+ if input is not None:
+ self.record_stream(input)
+ if target is not None:
+ target.record_stream(torch.cuda.current_stream())
+ self.preload()
+ return input, target
+
+ def _input_cuda_for_image(self):
+ self.next_input = self.next_input.cuda(non_blocking=True)
+
+ @staticmethod
+ def _record_stream_for_image(input):
+ input.record_stream(torch.cuda.current_stream())
diff --git a/what/models/detection/yolox/data/dataloading.py b/what/models/detection/yolox/data/dataloading.py
new file mode 100644
index 0000000..6fecf3f
--- /dev/null
+++ b/what/models/detection/yolox/data/dataloading.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+import random
+import uuid
+
+import numpy as np
+
+import torch
+from torch.utils.data.dataloader import DataLoader as torchDataLoader
+from torch.utils.data.dataloader import default_collate
+
+from .samplers import YoloBatchSampler
+
+
+def get_yolox_datadir():
+ """
+    Get the dataset dir of YOLOX. If the environment variable `YOLOX_DATADIR` is set,
+    its value is returned; otherwise a `datasets` directory next to the yolox package is used.
+ """
+ yolox_datadir = os.getenv("YOLOX_DATADIR", None)
+ if yolox_datadir is None:
+        import what.models.detection.yolox as yolox  # vendored copy of the upstream yolox package
+
+ yolox_path = os.path.dirname(os.path.dirname(yolox.__file__))
+ yolox_datadir = os.path.join(yolox_path, "datasets")
+ return yolox_datadir
+
+
+class DataLoader(torchDataLoader):
+ """
+ Lightnet dataloader that enables on the fly resizing of the images.
+ See :class:`torch.utils.data.DataLoader` for more information on the arguments.
+ Check more on the following website:
+ https://gitlab.com/EAVISE/lightnet/-/blob/master/lightnet/data/_dataloading.py
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.__initialized = False
+        shuffle = False
+        sampler = None
+        batch_sampler = None
+ if len(args) > 5:
+ shuffle = args[2]
+ sampler = args[3]
+ batch_sampler = args[4]
+ elif len(args) > 4:
+ shuffle = args[2]
+ sampler = args[3]
+ if "batch_sampler" in kwargs:
+ batch_sampler = kwargs["batch_sampler"]
+ elif len(args) > 3:
+ shuffle = args[2]
+ if "sampler" in kwargs:
+ sampler = kwargs["sampler"]
+ if "batch_sampler" in kwargs:
+ batch_sampler = kwargs["batch_sampler"]
+ else:
+ if "shuffle" in kwargs:
+ shuffle = kwargs["shuffle"]
+ if "sampler" in kwargs:
+ sampler = kwargs["sampler"]
+ if "batch_sampler" in kwargs:
+ batch_sampler = kwargs["batch_sampler"]
+
+ # Use custom BatchSampler
+ if batch_sampler is None:
+ if sampler is None:
+ if shuffle:
+ sampler = torch.utils.data.sampler.RandomSampler(self.dataset)
+ # sampler = torch.utils.data.DistributedSampler(self.dataset)
+ else:
+ sampler = torch.utils.data.sampler.SequentialSampler(self.dataset)
+ batch_sampler = YoloBatchSampler(
+ sampler,
+ self.batch_size,
+ self.drop_last,
+ input_dimension=self.dataset.input_dim,
+ )
+ # batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations =
+
+ self.batch_sampler = batch_sampler
+
+ self.__initialized = True
+
+ def close_mosaic(self):
+ self.batch_sampler.mosaic = False
+
+
+def list_collate(batch):
+ """
+ Function that collates lists or tuples together into one list (of lists/tuples).
+ Use this as the collate function in a Dataloader, if you want to have a list of
+ items as an output, as opposed to tensors (eg. Brambox.boxes).
+ """
+ items = list(zip(*batch))
+
+ for i in range(len(items)):
+ if isinstance(items[i][0], (list, tuple)):
+ items[i] = list(items[i])
+ else:
+ items[i] = default_collate(items[i])
+
+ return items
+
+
+def worker_init_reset_seed(worker_id):
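+    # Re-seed every dataloader worker so augmentations differ across workers.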
+ seed = uuid.uuid4().int % 2**32
+ random.seed(seed)
+ torch.set_rng_state(torch.manual_seed(seed).get_state())
+ np.random.seed(seed)
diff --git a/what/models/detection/yolox/data/datasets/__init__.py b/what/models/detection/yolox/data/datasets/__init__.py
new file mode 100644
index 0000000..0b6fd8e
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+from .coco import COCODataset
+from .coco_classes import COCO_CLASSES
+from .datasets_wrapper import CacheDataset, ConcatDataset, Dataset, MixConcatDataset
+from .mosaicdetection import MosaicDetection
+from .voc import VOCDetection
diff --git a/what/models/detection/yolox/data/datasets/coco.py b/what/models/detection/yolox/data/datasets/coco.py
new file mode 100644
index 0000000..8d19047
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/coco.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+import copy
+import os
+
+import cv2
+import numpy as np
+from pycocotools.coco import COCO
+
+from ..dataloading import get_yolox_datadir
+from .datasets_wrapper import CacheDataset, cache_read_img
+
+
+def remove_useless_info(coco):
+ """
+ Remove useless info in coco dataset. COCO object is modified inplace.
+ This function is mainly used for saving memory (save about 30% mem).
+ """
+ if isinstance(coco, COCO):
+ dataset = coco.dataset
+ dataset.pop("info", None)
+ dataset.pop("licenses", None)
+ for img in dataset["images"]:
+ img.pop("license", None)
+ img.pop("coco_url", None)
+ img.pop("date_captured", None)
+ img.pop("flickr_url", None)
+ if "annotations" in coco.dataset:
+ for anno in coco.dataset["annotations"]:
+ anno.pop("segmentation", None)
+
+
+class COCODataset(CacheDataset):
+ """
+ COCO dataset class.
+ """
+
+ def __init__(
+ self,
+ data_dir=None,
+ json_file="instances_train2017.json",
+ name="train2017",
+ img_size=(416, 416),
+ preproc=None,
+ cache=False,
+ cache_type="ram",
+ ):
+ """
+ COCO dataset initialization. Annotation data are read into memory by COCO API.
+ Args:
+ data_dir (str): dataset root directory
+ json_file (str): COCO json file name
+ name (str): COCO data name (e.g. 'train2017' or 'val2017')
+ img_size (int): target image size after pre-processing
+ preproc: data augmentation strategy
+ """
+ if data_dir is None:
+ data_dir = os.path.join(get_yolox_datadir(), "COCO")
+ self.data_dir = data_dir
+ self.json_file = json_file
+
+ self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file))
+ remove_useless_info(self.coco)
+ self.ids = self.coco.getImgIds()
+ self.num_imgs = len(self.ids)
+ self.class_ids = sorted(self.coco.getCatIds())
+ self.cats = self.coco.loadCats(self.coco.getCatIds())
+ self._classes = tuple([c["name"] for c in self.cats])
+ self.name = name
+ self.img_size = img_size
+ self.preproc = preproc
+ self.annotations = self._load_coco_annotations()
+
+ path_filename = [os.path.join(name, anno[3]) for anno in self.annotations]
+ super().__init__(
+ input_dimension=img_size,
+ num_imgs=self.num_imgs,
+ data_dir=data_dir,
+ cache_dir_name=f"cache_{name}",
+ path_filename=path_filename,
+ cache=cache,
+ cache_type=cache_type
+ )
+
+ def __len__(self):
+ return self.num_imgs
+
+ def _load_coco_annotations(self):
+ return [self.load_anno_from_ids(_ids) for _ids in self.ids]
+
+ def load_anno_from_ids(self, id_):
+ im_ann = self.coco.loadImgs(id_)[0]
+ width = im_ann["width"]
+ height = im_ann["height"]
+ anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
+ annotations = self.coco.loadAnns(anno_ids)
+ objs = []
+ for obj in annotations:
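+            # Clip the COCO (x, y, w, h) box to the image and convert it to (x1, y1, x2, y2)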
+ x1 = np.max((0, obj["bbox"][0]))
+ y1 = np.max((0, obj["bbox"][1]))
+ x2 = np.min((width, x1 + np.max((0, obj["bbox"][2]))))
+ y2 = np.min((height, y1 + np.max((0, obj["bbox"][3]))))
+ if obj["area"] > 0 and x2 >= x1 and y2 >= y1:
+ obj["clean_bbox"] = [x1, y1, x2, y2]
+ objs.append(obj)
+
+ num_objs = len(objs)
+
+ res = np.zeros((num_objs, 5))
+ for ix, obj in enumerate(objs):
+ cls = self.class_ids.index(obj["category_id"])
+ res[ix, 0:4] = obj["clean_bbox"]
+ res[ix, 4] = cls
+
+ r = min(self.img_size[0] / height, self.img_size[1] / width)
+ res[:, :4] *= r
+
+ img_info = (height, width)
+ resized_info = (int(height * r), int(width * r))
+
+ file_name = (
+ im_ann["file_name"]
+ if "file_name" in im_ann
+ else "{:012}".format(id_) + ".jpg"
+ )
+
+ return (res, img_info, resized_info, file_name)
+
+ def load_anno(self, index):
+ return self.annotations[index][0]
+
+ def load_resized_img(self, index):
+ img = self.load_image(index)
+ r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
+ interpolation=cv2.INTER_LINEAR,
+ ).astype(np.uint8)
+ return resized_img
+
+ def load_image(self, index):
+ file_name = self.annotations[index][3]
+
+ img_file = os.path.join(self.data_dir, self.name, file_name)
+
+ img = cv2.imread(img_file)
+ assert img is not None, f"file named {img_file} not found"
+
+ return img
+
+ @cache_read_img(use_cache=True)
+ def read_img(self, index):
+ return self.load_resized_img(index)
+
+ def pull_item(self, index):
+ id_ = self.ids[index]
+ label, origin_image_size, _, _ = self.annotations[index]
+ img = self.read_img(index)
+
+ return img, copy.deepcopy(label), origin_image_size, np.array([id_])
+
+ @CacheDataset.mosaic_getitem
+ def __getitem__(self, index):
+ """
+ One image / label pair for the given index is picked up and pre-processed.
+
+ Args:
+ index (int): data index
+
+ Returns:
+ img (numpy.ndarray): pre-processed image
+ padded_labels (torch.Tensor): pre-processed label data.
+ The shape is :math:`[max_labels, 5]`.
+ each label consists of [class, xc, yc, w, h]:
+ class (float): class index.
+ xc, yc (float) : center of bbox whose values range from 0 to 1.
+ w, h (float) : size of bbox whose values range from 0 to 1.
+ info_img : tuple of h, w.
+ h, w (int): original shape of the image
+ img_id (int): same as the input index. Used for evaluation.
+ """
+ img, target, img_info, img_id = self.pull_item(index)
+
+ if self.preproc is not None:
+ img, target = self.preproc(img, target, self.input_dim)
+ return img, target, img_info, img_id
diff --git a/what/models/detection/yolox/data/datasets/coco_classes.py b/what/models/detection/yolox/data/datasets/coco_classes.py
new file mode 100644
index 0000000..17f5cbe
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/coco_classes.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+COCO_CLASSES = (
+ "person",
+ "bicycle",
+ "car",
+ "motorcycle",
+ "airplane",
+ "bus",
+ "train",
+ "truck",
+ "boat",
+ "traffic light",
+ "fire hydrant",
+ "stop sign",
+ "parking meter",
+ "bench",
+ "bird",
+ "cat",
+ "dog",
+ "horse",
+ "sheep",
+ "cow",
+ "elephant",
+ "bear",
+ "zebra",
+ "giraffe",
+ "backpack",
+ "umbrella",
+ "handbag",
+ "tie",
+ "suitcase",
+ "frisbee",
+ "skis",
+ "snowboard",
+ "sports ball",
+ "kite",
+ "baseball bat",
+ "baseball glove",
+ "skateboard",
+ "surfboard",
+ "tennis racket",
+ "bottle",
+ "wine glass",
+ "cup",
+ "fork",
+ "knife",
+ "spoon",
+ "bowl",
+ "banana",
+ "apple",
+ "sandwich",
+ "orange",
+ "broccoli",
+ "carrot",
+ "hot dog",
+ "pizza",
+ "donut",
+ "cake",
+ "chair",
+ "couch",
+ "potted plant",
+ "bed",
+ "dining table",
+ "toilet",
+ "tv",
+ "laptop",
+ "mouse",
+ "remote",
+ "keyboard",
+ "cell phone",
+ "microwave",
+ "oven",
+ "toaster",
+ "sink",
+ "refrigerator",
+ "book",
+ "clock",
+ "vase",
+ "scissors",
+ "teddy bear",
+ "hair drier",
+ "toothbrush",
+)
diff --git a/what/models/detection/yolox/data/datasets/datasets_wrapper.py b/what/models/detection/yolox/data/datasets/datasets_wrapper.py
new file mode 100644
index 0000000..c45fe38
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/datasets_wrapper.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import bisect
+import copy
+import os
+import random
+from abc import ABCMeta, abstractmethod
+from functools import partial, wraps
+from multiprocessing.pool import ThreadPool
+import psutil
+from loguru import logger
+from tqdm import tqdm
+
+import numpy as np
+
+from torch.utils.data.dataset import ConcatDataset as torchConcatDataset
+from torch.utils.data.dataset import Dataset as torchDataset
+
+
+class ConcatDataset(torchConcatDataset):
+ def __init__(self, datasets):
+ super(ConcatDataset, self).__init__(datasets)
+ if hasattr(self.datasets[0], "input_dim"):
+ self._input_dim = self.datasets[0].input_dim
+ self.input_dim = self.datasets[0].input_dim
+
+ def pull_item(self, idx):
+ if idx < 0:
+ if -idx > len(self):
+ raise ValueError(
+ "absolute value of index should not exceed dataset length"
+ )
+ idx = len(self) + idx
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+ if dataset_idx == 0:
+ sample_idx = idx
+ else:
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+ return self.datasets[dataset_idx].pull_item(sample_idx)
+
+
+class MixConcatDataset(torchConcatDataset):
+ def __init__(self, datasets):
+ super(MixConcatDataset, self).__init__(datasets)
+ if hasattr(self.datasets[0], "input_dim"):
+ self._input_dim = self.datasets[0].input_dim
+ self.input_dim = self.datasets[0].input_dim
+
+ def __getitem__(self, index):
+
+ if not isinstance(index, int):
+ idx = index[1]
+ if idx < 0:
+ if -idx > len(self):
+ raise ValueError(
+ "absolute value of index should not exceed dataset length"
+ )
+ idx = len(self) + idx
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+ if dataset_idx == 0:
+ sample_idx = idx
+ else:
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+ if not isinstance(index, int):
+ index = (index[0], sample_idx, index[2])
+
+ return self.datasets[dataset_idx][index]
+
+
+class Dataset(torchDataset):
+ """ This class is a subclass of the base :class:`torch.utils.data.Dataset`,
+ that enables on the fly resizing of the ``input_dim``.
+
+ Args:
+ input_dimension (tuple): (width,height) tuple with default dimensions of the network
+ """
+
+ def __init__(self, input_dimension, mosaic=True):
+ super().__init__()
+ self.__input_dim = input_dimension[:2]
+ self.enable_mosaic = mosaic
+
+ @property
+ def input_dim(self):
+ """
+ Dimension that can be used by transforms to set the correct image size, etc.
+ This allows transforms to have a single source of truth
+ for the input dimension of the network.
+
+ Return:
+ list: Tuple containing the current width,height
+ """
+ if hasattr(self, "_input_dim"):
+ return self._input_dim
+ return self.__input_dim
+
+ @staticmethod
+ def mosaic_getitem(getitem_fn):
+ """
+ Decorator method that needs to be used around the ``__getitem__`` method. |br|
+ This decorator enables the closing mosaic
+
+ Example:
+ >>> class CustomSet(ln.data.Dataset):
+ ... def __len__(self):
+ ... return 10
+ ... @ln.data.Dataset.mosaic_getitem
+ ... def __getitem__(self, index):
+ ... return self.enable_mosaic
+ """
+
+ @wraps(getitem_fn)
+ def wrapper(self, index):
+ if not isinstance(index, int):
+ self.enable_mosaic = index[0]
+ index = index[1]
+
+ ret_val = getitem_fn(self, index)
+
+ return ret_val
+
+ return wrapper
+
+
+class CacheDataset(Dataset, metaclass=ABCMeta):
+ """ This class is a subclass of the base :class:`yolox.data.datasets.Dataset`,
+ that enables cache images to ram or disk.
+
+ Args:
+ input_dimension (tuple): (width,height) tuple with default dimensions of the network
+ num_imgs (int): datset size
+ data_dir (str): the root directory of the dataset, e.g. `/path/to/COCO`.
+ cache_dir_name (str): the name of the directory to cache to disk,
+ e.g. `"custom_cache"`. The files cached to disk will be saved
+ under `/path/to/COCO/custom_cache`.
+ path_filename (str): a list of paths to the data relative to the `data_dir`,
+ e.g. if you have data `/path/to/COCO/train/1.jpg`, `/path/to/COCO/train/2.jpg`,
+ then `path_filename = ['train/1.jpg', ' train/2.jpg']`.
+ cache (bool): whether to cache the images to ram or disk.
+ cache_type (str): the type of cache,
+ "ram" : Caching imgs to ram for fast training.
+ "disk": Caching imgs to disk for fast training.
+ """
+
+ def __init__(
+ self,
+ input_dimension,
+ num_imgs=None,
+ data_dir=None,
+ cache_dir_name=None,
+ path_filename=None,
+ cache=False,
+ cache_type="ram",
+ ):
+ super().__init__(input_dimension)
+ self.cache = cache
+ self.cache_type = cache_type
+
+ if self.cache and self.cache_type == "disk":
+ self.cache_dir = os.path.join(data_dir, cache_dir_name)
+ self.path_filename = path_filename
+
+ if self.cache and self.cache_type == "ram":
+ self.imgs = None
+
+ if self.cache:
+ self.cache_images(
+ num_imgs=num_imgs,
+ data_dir=data_dir,
+ cache_dir_name=cache_dir_name,
+ path_filename=path_filename,
+ )
+
+ def __del__(self):
+ if self.cache and self.cache_type == "ram":
+ del self.imgs
+
+ @abstractmethod
+ def read_img(self, index):
+ """
+ Given index, return the corresponding image
+
+ Args:
+ index (int): image index
+ """
+ raise NotImplementedError
+
+ def cache_images(
+ self,
+ num_imgs=None,
+ data_dir=None,
+ cache_dir_name=None,
+ path_filename=None,
+ ):
+ assert num_imgs is not None, "num_imgs must be specified as the size of the dataset"
+ if self.cache_type == "disk":
+            assert data_dir is not None and cache_dir_name is not None and path_filename is not None, \
+                "data_dir, cache_dir_name and path_filename must be specified if cache_type is disk"
+ self.path_filename = path_filename
+
+ mem = psutil.virtual_memory()
+ mem_required = self.cal_cache_occupy(num_imgs)
+ gb = 1 << 30
+
+ if self.cache_type == "ram":
+ if mem_required > mem.available:
+ self.cache = False
+ else:
+ logger.info(
+ f"{mem_required / gb:.1f}GB RAM required, "
+ f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB RAM available, "
+ f"Since the first thing we do is cache, "
+ f"there is no guarantee that the remaining memory space is sufficient"
+ )
+
+ if self.cache and self.imgs is None:
+ if self.cache_type == 'ram':
+ self.imgs = [None] * num_imgs
+ logger.info("You are using cached images in RAM to accelerate training!")
+ else: # 'disk'
+ if not os.path.exists(self.cache_dir):
+ os.mkdir(self.cache_dir)
+ logger.warning(
+ f"\n*******************************************************************\n"
+ f"You are using cached images in DISK to accelerate training.\n"
+ f"This requires large DISK space.\n"
+ f"Make sure you have {mem_required / gb:.1f} "
+ f"available DISK space for training your dataset.\n"
+ f"*******************************************************************\\n"
+ )
+ else:
+ logger.info(f"Found disk cache at {self.cache_dir}")
+ return
+
+ logger.info(
+ "Caching images...\n"
+ "This might take some time for your dataset"
+ )
+
+ num_threads = min(8, max(1, os.cpu_count() - 1))
+ b = 0
+ load_imgs = ThreadPool(num_threads).imap(
+ partial(self.read_img, use_cache=False),
+ range(num_imgs)
+ )
+ pbar = tqdm(enumerate(load_imgs), total=num_imgs)
+ for i, x in pbar: # x = self.read_img(self, i, use_cache=False)
+ if self.cache_type == 'ram':
+ self.imgs[i] = x
+ else: # 'disk'
+ cache_filename = f'{self.path_filename[i].split(".")[0]}.npy'
+ cache_path_filename = os.path.join(self.cache_dir, cache_filename)
+ os.makedirs(os.path.dirname(cache_path_filename), exist_ok=True)
+ np.save(cache_path_filename, x)
+ b += x.nbytes
+ pbar.desc = \
+ f'Caching images ({b / gb:.1f}/{mem_required / gb:.1f}GB {self.cache_type})'
+ pbar.close()
+
+ def cal_cache_occupy(self, num_imgs):
+ cache_bytes = 0
+ num_samples = min(num_imgs, 32)
+ for _ in range(num_samples):
+ img = self.read_img(index=random.randint(0, num_imgs - 1), use_cache=False)
+ cache_bytes += img.nbytes
+ mem_required = cache_bytes * num_imgs / num_samples
+ return mem_required
+
+
+def cache_read_img(use_cache=True):
+ def decorator(read_img_fn):
+ """
+ Decorate the read_img function to cache the image
+
+ Args:
+ read_img_fn: read_img function
+ use_cache (bool, optional): For the decorated read_img function,
+ whether to read the image from cache.
+ Defaults to True.
+ """
+ @wraps(read_img_fn)
+ def wrapper(self, index, use_cache=use_cache):
+ cache = self.cache and use_cache
+ if cache:
+ if self.cache_type == "ram":
+ img = self.imgs[index]
+ img = copy.deepcopy(img)
+ elif self.cache_type == "disk":
+ img = np.load(
+ os.path.join(
+ self.cache_dir, f"{self.path_filename[index].split('.')[0]}.npy"))
+ else:
+ raise ValueError(f"Unknown cache type: {self.cache_type}")
+ else:
+ img = read_img_fn(self, index)
+ return img
+ return wrapper
+ return decorator
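A minimal sketch of how a custom dataset could plug into the caching machinery above. `ToyDataset`, its flat image directory and file layout are hypothetical; only `CacheDataset`, `cache_read_img` and the `read_img` contract come from datasets_wrapper.py.

import os

import cv2

from what.models.detection.yolox.data.datasets.datasets_wrapper import (
    CacheDataset,
    cache_read_img,
)


class ToyDataset(CacheDataset):
    """Hypothetical dataset with images stored flat under data_dir."""

    def __init__(self, data_dir, img_size=(416, 416), cache=False, cache_type="ram"):
        self.data_dir = data_dir
        self.files = sorted(os.listdir(data_dir))
        super().__init__(
            input_dimension=img_size,
            num_imgs=len(self.files),
            data_dir=data_dir,
            cache_dir_name="cache_toy",
            path_filename=self.files,
            cache=cache,
            cache_type=cache_type,
        )

    def __len__(self):
        return len(self.files)

    @cache_read_img(use_cache=True)
    def read_img(self, index):
        # Called with use_cache=False while the cache is being filled,
        # then with use_cache=True so images are served from RAM or disk.
        return cv2.imread(os.path.join(self.data_dir, self.files[index]))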
diff --git a/what/models/detection/yolox/data/datasets/mosaicdetection.py b/what/models/detection/yolox/data/datasets/mosaicdetection.py
new file mode 100644
index 0000000..7f4ee57
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/mosaicdetection.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import random
+
+import cv2
+import numpy as np
+
+from what.models.detection.yolox.utils import adjust_box_anns, get_local_rank
+
+from ..data_augment import random_affine
+from .datasets_wrapper import Dataset
+
+
+def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, input_w):
+ # TODO update doc
+ # index0 to top left part of image
+ if mosaic_index == 0:
+ x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
+ small_coord = w - (x2 - x1), h - (y2 - y1), w, h
+ # index1 to top right part of image
+ elif mosaic_index == 1:
+ x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
+ small_coord = 0, h - (y2 - y1), min(w, x2 - x1), h
+ # index2 to bottom left part of image
+ elif mosaic_index == 2:
+ x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
+ small_coord = w - (x2 - x1), 0, w, min(y2 - y1, h)
+    # index3 to bottom right part of image
+ elif mosaic_index == 3:
+ x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h) # noqa
+ small_coord = 0, 0, min(w, x2 - x1), min(y2 - y1, h)
+ return (x1, y1, x2, y2), small_coord
+
+
+class MosaicDetection(Dataset):
+ """Detection dataset wrapper that performs mixup for normal dataset."""
+
+ def __init__(
+ self, dataset, img_size, mosaic=True, preproc=None,
+ degrees=10.0, translate=0.1, mosaic_scale=(0.5, 1.5),
+ mixup_scale=(0.5, 1.5), shear=2.0, enable_mixup=True,
+ mosaic_prob=1.0, mixup_prob=1.0, *args
+ ):
+ """
+
+ Args:
+ dataset(Dataset) : Pytorch dataset object.
+ img_size (tuple):
+ mosaic (bool): enable mosaic augmentation or not.
+ preproc (func):
+ degrees (float):
+ translate (float):
+ mosaic_scale (tuple):
+ mixup_scale (tuple):
+ shear (float):
+ enable_mixup (bool):
+ *args(tuple) : Additional arguments for mixup random sampler.
+ """
+ super().__init__(img_size, mosaic=mosaic)
+ self._dataset = dataset
+ self.preproc = preproc
+ self.degrees = degrees
+ self.translate = translate
+ self.scale = mosaic_scale
+ self.shear = shear
+ self.mixup_scale = mixup_scale
+ self.enable_mosaic = mosaic
+ self.enable_mixup = enable_mixup
+ self.mosaic_prob = mosaic_prob
+ self.mixup_prob = mixup_prob
+ self.local_rank = get_local_rank()
+
+ def __len__(self):
+ return len(self._dataset)
+
+ @Dataset.mosaic_getitem
+ def __getitem__(self, idx):
+ if self.enable_mosaic and random.random() < self.mosaic_prob:
+ mosaic_labels = []
+ input_dim = self._dataset.input_dim
+ input_h, input_w = input_dim[0], input_dim[1]
+
+ # yc, xc = s, s # mosaic center x, y
+ yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
+ xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))
+
+ # 3 additional image indices
+ indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)]
+
+ for i_mosaic, index in enumerate(indices):
+ img, _labels, _, img_id = self._dataset.pull_item(index)
+ h0, w0 = img.shape[:2] # orig hw
+ scale = min(1. * input_h / h0, 1. * input_w / w0)
+ img = cv2.resize(
+ img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR
+ )
+ # generate output mosaic image
+ (h, w, c) = img.shape[:3]
+ if i_mosaic == 0:
+ mosaic_img = np.full((input_h * 2, input_w * 2, c), 114, dtype=np.uint8)
+
+ # suffix l means large image, while s means small image in mosaic aug.
+ (l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate(
+ mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w
+ )
+
+ mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
+ padw, padh = l_x1 - s_x1, l_y1 - s_y1
+
+ labels = _labels.copy()
+                # Scale and shift the pixel xyxy labels into mosaic canvas coordinates
+ if _labels.size > 0:
+ labels[:, 0] = scale * _labels[:, 0] + padw
+ labels[:, 1] = scale * _labels[:, 1] + padh
+ labels[:, 2] = scale * _labels[:, 2] + padw
+ labels[:, 3] = scale * _labels[:, 3] + padh
+ mosaic_labels.append(labels)
+
+ if len(mosaic_labels):
+ mosaic_labels = np.concatenate(mosaic_labels, 0)
+ np.clip(mosaic_labels[:, 0], 0, 2 * input_w, out=mosaic_labels[:, 0])
+ np.clip(mosaic_labels[:, 1], 0, 2 * input_h, out=mosaic_labels[:, 1])
+ np.clip(mosaic_labels[:, 2], 0, 2 * input_w, out=mosaic_labels[:, 2])
+ np.clip(mosaic_labels[:, 3], 0, 2 * input_h, out=mosaic_labels[:, 3])
+
+ mosaic_img, mosaic_labels = random_affine(
+ mosaic_img,
+ mosaic_labels,
+ target_size=(input_w, input_h),
+ degrees=self.degrees,
+ translate=self.translate,
+ scales=self.scale,
+ shear=self.shear,
+ )
+
+ # -----------------------------------------------------------------
+ # CopyPaste: https://arxiv.org/abs/2012.07177
+ # -----------------------------------------------------------------
+ if (
+ self.enable_mixup
+ and not len(mosaic_labels) == 0
+ and random.random() < self.mixup_prob
+ ):
+ mosaic_img, mosaic_labels = self.mixup(mosaic_img, mosaic_labels, self.input_dim)
+ mix_img, padded_labels = self.preproc(mosaic_img, mosaic_labels, self.input_dim)
+ img_info = (mix_img.shape[1], mix_img.shape[0])
+
+ # -----------------------------------------------------------------
+ # img_info and img_id are not used for training.
+            # They are also hard to specify for a mosaic image.
+ # -----------------------------------------------------------------
+ return mix_img, padded_labels, img_info, img_id
+
+ else:
+ self._dataset._input_dim = self.input_dim
+ img, label, img_info, img_id = self._dataset.pull_item(idx)
+ img, label = self.preproc(img, label, self.input_dim)
+ return img, label, img_info, img_id
+
+ def mixup(self, origin_img, origin_labels, input_dim):
+ jit_factor = random.uniform(*self.mixup_scale)
+ FLIP = random.uniform(0, 1) > 0.5
+ cp_labels = []
+ while len(cp_labels) == 0:
+ cp_index = random.randint(0, self.__len__() - 1)
+ cp_labels = self._dataset.load_anno(cp_index)
+ img, cp_labels, _, _ = self._dataset.pull_item(cp_index)
+
+ if len(img.shape) == 3:
+ cp_img = np.ones((input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114
+ else:
+ cp_img = np.ones(input_dim, dtype=np.uint8) * 114
+
+ cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
+ interpolation=cv2.INTER_LINEAR,
+ )
+
+ cp_img[
+ : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio)
+ ] = resized_img
+
+ cp_img = cv2.resize(
+ cp_img,
+ (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)),
+ )
+ cp_scale_ratio *= jit_factor
+
+ if FLIP:
+ cp_img = cp_img[:, ::-1, :]
+
+ origin_h, origin_w = cp_img.shape[:2]
+ target_h, target_w = origin_img.shape[:2]
+ padded_img = np.zeros(
+ (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
+ )
+ padded_img[:origin_h, :origin_w] = cp_img
+
+ x_offset, y_offset = 0, 0
+ if padded_img.shape[0] > target_h:
+ y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
+ if padded_img.shape[1] > target_w:
+ x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
+ padded_cropped_img = padded_img[
+ y_offset: y_offset + target_h, x_offset: x_offset + target_w
+ ]
+
+ cp_bboxes_origin_np = adjust_box_anns(
+ cp_labels[:, :4].copy(), cp_scale_ratio, 0, 0, origin_w, origin_h
+ )
+ if FLIP:
+ cp_bboxes_origin_np[:, 0::2] = (
+ origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]
+ )
+ cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
+ cp_bboxes_transformed_np[:, 0::2] = np.clip(
+ cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w
+ )
+ cp_bboxes_transformed_np[:, 1::2] = np.clip(
+ cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h
+ )
+
+ cls_labels = cp_labels[:, 4:5].copy()
+ box_labels = cp_bboxes_transformed_np
+ labels = np.hstack((box_labels, cls_labels))
+ origin_labels = np.vstack((origin_labels, labels))
+ origin_img = origin_img.astype(np.float32)
+ origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32)
+
+ return origin_img.astype(np.uint8), origin_labels
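A standalone check of the tile-placement arithmetic that `__getitem__` above relies on. The canvas size, centre and image size below are made-up values; only `get_mosaic_coordinate` itself comes from this file.

import numpy as np

from what.models.detection.yolox.data.datasets.mosaicdetection import get_mosaic_coordinate

input_h, input_w = 640, 640
mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8)
xc, yc = 700, 650   # mosaic centre, normally sampled in [0.5 * dim, 1.5 * dim]
h, w = 300, 400     # size of the resized source image

# Tile 0 goes into the top-left quadrant relative to (xc, yc).
(l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate(
    mosaic_img, 0, xc, yc, w, h, input_h, input_w
)

# The destination region on the canvas and the cropped source region always
# have the same shape, so the slice assignment in __getitem__ is well defined.
assert (l_y2 - l_y1, l_x2 - l_x1) == (s_y2 - s_y1, s_x2 - s_x1)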
diff --git a/what/models/detection/yolox/data/datasets/voc.py b/what/models/detection/yolox/data/datasets/voc.py
new file mode 100644
index 0000000..1f176ed
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/voc.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Code are based on
+# https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
+# Copyright (c) Francisco Massa.
+# Copyright (c) Ellis Brown, Max deGroot.
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+import os.path
+import pickle
+import xml.etree.ElementTree as ET
+
+import cv2
+import numpy as np
+
+from what.models.detection.yolox.evaluators.voc_eval import voc_eval
+
+from .datasets_wrapper import CacheDataset, cache_read_img
+from .voc_classes import VOC_CLASSES
+
+
+class AnnotationTransform(object):
+
+ """Transforms a VOC annotation into a Tensor of bbox coords and label index
+ Initilized with a dictionary lookup of classnames to indexes
+
+ Arguments:
+ class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
+ (default: alphabetic indexing of VOC's 20 classes)
+ keep_difficult (bool, optional): keep difficult instances or not
+ (default: False)
+ height (int): height
+ width (int): width
+ """
+
+ def __init__(self, class_to_ind=None, keep_difficult=True):
+ self.class_to_ind = class_to_ind or dict(
+ zip(VOC_CLASSES, range(len(VOC_CLASSES)))
+ )
+ self.keep_difficult = keep_difficult
+
+ def __call__(self, target):
+ """
+ Arguments:
+ target (annotation) : the target annotation to be made usable
+ will be an ET.Element
+ Returns:
+ a list containing lists of bounding boxes [bbox coords, class name]
+ """
+ res = np.empty((0, 5))
+ for obj in target.iter("object"):
+ difficult = obj.find("difficult")
+ if difficult is not None:
+ difficult = int(difficult.text) == 1
+ else:
+ difficult = False
+ if not self.keep_difficult and difficult:
+ continue
+ name = obj.find("name").text.strip()
+ bbox = obj.find("bndbox")
+
+ pts = ["xmin", "ymin", "xmax", "ymax"]
+ bndbox = []
+ for i, pt in enumerate(pts):
+ cur_pt = int(float(bbox.find(pt).text)) - 1
+ # scale height or width
+ # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
+ bndbox.append(cur_pt)
+ label_idx = self.class_to_ind[name]
+ bndbox.append(label_idx)
+ res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind]
+ # img_id = target.find('filename').text[:-4]
+
+ width = int(target.find("size").find("width").text)
+ height = int(target.find("size").find("height").text)
+ img_info = (height, width)
+
+ return res, img_info
+
+
+class VOCDetection(CacheDataset):
+
+ """
+ VOC Detection Dataset Object
+
+ input is image, target is annotation
+
+ Args:
+ root (string): filepath to VOCdevkit folder.
+ image_set (string): imageset to use (eg. 'train', 'val', 'test')
+ transform (callable, optional): transformation to perform on the
+ input image
+ target_transform (callable, optional): transformation to perform on the
+ target `annotation`
+ (eg: take in caption string, return tensor of word indices)
+ dataset_name (string, optional): which dataset to load
+ (default: 'VOC2007')
+ """
+
+ def __init__(
+ self,
+ data_dir,
+ image_sets=[("2007", "trainval"), ("2012", "trainval")],
+ img_size=(416, 416),
+ preproc=None,
+ target_transform=AnnotationTransform(),
+ dataset_name="VOC0712",
+ cache=False,
+ cache_type="ram",
+ ):
+ self.root = data_dir
+ self.image_set = image_sets
+ self.img_size = img_size
+ self.preproc = preproc
+ self.target_transform = target_transform
+ self.name = dataset_name
+ self._annopath = os.path.join("%s", "Annotations", "%s.xml")
+ self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg")
+ self._classes = VOC_CLASSES
+ self.cats = [
+ {"id": idx, "name": val} for idx, val in enumerate(VOC_CLASSES)
+ ]
+ self.class_ids = list(range(len(VOC_CLASSES)))
+ self.ids = list()
+ for (year, name) in image_sets:
+ self._year = year
+ rootpath = os.path.join(self.root, "VOC" + year)
+ for line in open(
+ os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
+ ):
+ self.ids.append((rootpath, line.strip()))
+ self.num_imgs = len(self.ids)
+
+ self.annotations = self._load_coco_annotations()
+
+ path_filename = [
+ (self._imgpath % self.ids[i]).split(self.root + "/")[1]
+ for i in range(self.num_imgs)
+ ]
+ super().__init__(
+ input_dimension=img_size,
+ num_imgs=self.num_imgs,
+ data_dir=self.root,
+ cache_dir_name=f"cache_{self.name}",
+ path_filename=path_filename,
+ cache=cache,
+ cache_type=cache_type
+ )
+
+ def __len__(self):
+ return self.num_imgs
+
+ def _load_coco_annotations(self):
+ return [self.load_anno_from_ids(_ids) for _ids in range(self.num_imgs)]
+
+ def load_anno_from_ids(self, index):
+ img_id = self.ids[index]
+ target = ET.parse(self._annopath % img_id).getroot()
+
+ assert self.target_transform is not None
+ res, img_info = self.target_transform(target)
+ height, width = img_info
+
+ r = min(self.img_size[0] / height, self.img_size[1] / width)
+ res[:, :4] *= r
+ resized_info = (int(height * r), int(width * r))
+
+ return (res, img_info, resized_info)
+
+ def load_anno(self, index):
+ return self.annotations[index][0]
+
+ def load_resized_img(self, index):
+ img = self.load_image(index)
+ r = min(self.img_size[0] / img.shape[0], self.img_size[1] / img.shape[1])
+ resized_img = cv2.resize(
+ img,
+ (int(img.shape[1] * r), int(img.shape[0] * r)),
+ interpolation=cv2.INTER_LINEAR,
+ ).astype(np.uint8)
+
+ return resized_img
+
+ def load_image(self, index):
+ img_id = self.ids[index]
+ img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)
+ assert img is not None, f"file named {self._imgpath % img_id} not found"
+
+ return img
+
+ @cache_read_img(use_cache=True)
+ def read_img(self, index):
+ return self.load_resized_img(index)
+
+ def pull_item(self, index):
+ """Returns the original image and target at an index for mixup
+
+ Note: not using self.__getitem__(), as any transformations passed in
+ could mess up this functionality.
+
+ Argument:
+ index (int): index of img to show
+ Return:
+ img, target
+ """
+ target, img_info, _ = self.annotations[index]
+ img = self.read_img(index)
+
+ return img, target, img_info, index
+
+ @CacheDataset.mosaic_getitem
+ def __getitem__(self, index):
+ img, target, img_info, img_id = self.pull_item(index)
+
+ if self.preproc is not None:
+ img, target = self.preproc(img, target, self.input_dim)
+
+ return img, target, img_info, img_id
+
+ def evaluate_detections(self, all_boxes, output_dir=None):
+ """
+ all_boxes is a list of length number-of-classes.
+ Each list element is a list of length number-of-images.
+ Each of those list elements is either an empty list []
+ or a numpy array of detection.
+
+ all_boxes[class][image] = [] or np.array of shape #dets x 5
+ """
+ self._write_voc_results_file(all_boxes)
+ IouTh = np.linspace(
+ 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True
+ )
+ mAPs = []
+ for iou in IouTh:
+ mAP = self._do_python_eval(output_dir, iou)
+ mAPs.append(mAP)
+
+ print("--------------------------------------------------------------")
+ print("map_5095:", np.mean(mAPs))
+ print("map_50:", mAPs[0])
+ print("--------------------------------------------------------------")
+ return np.mean(mAPs), mAPs[0]
+
+ def _get_voc_results_file_template(self):
+ filename = "comp4_det_test" + "_{:s}.txt"
+ filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main")
+ if not os.path.exists(filedir):
+ os.makedirs(filedir)
+ path = os.path.join(filedir, filename)
+ return path
+
+ def _write_voc_results_file(self, all_boxes):
+ for cls_ind, cls in enumerate(VOC_CLASSES):
+ if cls == "__background__":
+ continue
+ print("Writing {} VOC results file".format(cls))
+ filename = self._get_voc_results_file_template().format(cls)
+ with open(filename, "wt") as f:
+ for im_ind, index in enumerate(self.ids):
+ index = index[1]
+ dets = all_boxes[cls_ind][im_ind]
+                    if len(dets) == 0:
+ continue
+ for k in range(dets.shape[0]):
+ f.write(
+ "{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n".format(
+ index,
+ dets[k, -1],
+ dets[k, 0] + 1,
+ dets[k, 1] + 1,
+ dets[k, 2] + 1,
+ dets[k, 3] + 1,
+ )
+ )
+
+ def _do_python_eval(self, output_dir="output", iou=0.5):
+ rootpath = os.path.join(self.root, "VOC" + self._year)
+ name = self.image_set[0][1]
+ annopath = os.path.join(rootpath, "Annotations", "{:s}.xml")
+ imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
+ cachedir = os.path.join(
+ self.root, "annotations_cache", "VOC" + self._year, name
+ )
+ if not os.path.exists(cachedir):
+ os.makedirs(cachedir)
+ aps = []
+ # The PASCAL VOC metric changed in 2010
+ use_07_metric = True if int(self._year) < 2010 else False
+ print("Eval IoU : {:.2f}".format(iou))
+ if output_dir is not None and not os.path.isdir(output_dir):
+ os.mkdir(output_dir)
+ for i, cls in enumerate(VOC_CLASSES):
+
+ if cls == "__background__":
+ continue
+
+ filename = self._get_voc_results_file_template().format(cls)
+ rec, prec, ap = voc_eval(
+ filename,
+ annopath,
+ imagesetfile,
+ cls,
+ cachedir,
+ ovthresh=iou,
+ use_07_metric=use_07_metric,
+ )
+ aps += [ap]
+ if iou == 0.5:
+ print("AP for {} = {:.4f}".format(cls, ap))
+ if output_dir is not None:
+ with open(os.path.join(output_dir, cls + "_pr.pkl"), "wb") as f:
+ pickle.dump({"rec": rec, "prec": prec, "ap": ap}, f)
+ if iou == 0.5:
+ print("Mean AP = {:.4f}".format(np.mean(aps)))
+ print("~~~~~~~~")
+ print("Results:")
+ for ap in aps:
+ print("{:.3f}".format(ap))
+ print("{:.3f}".format(np.mean(aps)))
+ print("~~~~~~~~")
+ print("")
+ print("--------------------------------------------------------------")
+ print("Results computed with the **unofficial** Python eval code.")
+ print("Results should be very close to the official MATLAB eval code.")
+ print("Recompute with `./tools/reval.py --matlab ...` for your paper.")
+ print("-- Thanks, The Management")
+ print("--------------------------------------------------------------")
+
+ return np.mean(aps)
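A small sketch of what `AnnotationTransform` produces from a single hand-written PASCAL VOC annotation; the XML content and values are illustrative only.

import xml.etree.ElementTree as ET

from what.models.detection.yolox.data.datasets.voc import AnnotationTransform

xml = """
<annotation>
  <size><width>500</width><height>375</height></size>
  <object>
    <name>dog</name>
    <difficult>0</difficult>
    <bndbox><xmin>48</xmin><ymin>240</ymin><xmax>195</xmax><ymax>371</ymax></bndbox>
  </object>
</annotation>
"""

res, img_info = AnnotationTransform()(ET.fromstring(xml))
print(res)       # [[ 47. 239. 194. 370.  11.]]  -> 0-based xyxy plus class index ("dog" == 11)
print(img_info)  # (375, 500)                    -> (height, width)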
diff --git a/what/models/detection/yolox/data/datasets/voc_classes.py b/what/models/detection/yolox/data/datasets/voc_classes.py
new file mode 100644
index 0000000..89354b3
--- /dev/null
+++ b/what/models/detection/yolox/data/datasets/voc_classes.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+# VOC_CLASSES = ( '__background__', # always index 0
+VOC_CLASSES = (
+ "aeroplane",
+ "bicycle",
+ "bird",
+ "boat",
+ "bottle",
+ "bus",
+ "car",
+ "cat",
+ "chair",
+ "cow",
+ "diningtable",
+ "dog",
+ "horse",
+ "motorbike",
+ "person",
+ "pottedplant",
+ "sheep",
+ "sofa",
+ "train",
+ "tvmonitor",
+)
diff --git a/what/models/detection/yolox/data/samplers.py b/what/models/detection/yolox/data/samplers.py
new file mode 100644
index 0000000..6b7ea38
--- /dev/null
+++ b/what/models/detection/yolox/data/samplers.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import itertools
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from torch.utils.data.sampler import BatchSampler as torchBatchSampler
+from torch.utils.data.sampler import Sampler
+
+
+class YoloBatchSampler(torchBatchSampler):
+ """
+ This batch sampler will generate mini-batches of (mosaic, index) tuples from another sampler.
+ It works just like the :class:`torch.utils.data.sampler.BatchSampler`,
+ but it will turn on/off the mosaic aug.
+ """
+
+ def __init__(self, *args, mosaic=True, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.mosaic = mosaic
+
+ def __iter__(self):
+ for batch in super().__iter__():
+ yield [(self.mosaic, idx) for idx in batch]
+
+
+class InfiniteSampler(Sampler):
+ """
+ In training, we only care about the "infinite stream" of training data.
+ So this sampler produces an infinite stream of indices and
+ all workers cooperate to correctly shuffle the indices and sample different indices.
+    The samplers in each worker effectively produce `indices[worker_id::num_workers]`
+ where `indices` is an infinite stream of indices consisting of
+ `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True)
+ or `range(size) + range(size) + ...` (if shuffle is False)
+ """
+
+ def __init__(
+ self,
+ size: int,
+ shuffle: bool = True,
+ seed: Optional[int] = 0,
+ rank=0,
+ world_size=1,
+ ):
+ """
+ Args:
+            size (int): the total number of samples in the underlying dataset
+ shuffle (bool): whether to shuffle the indices or not
+ seed (int): the initial seed of the shuffle. Must be the same
+ across all workers. If None, will use a random seed shared
+ among workers (require synchronization among all workers).
+ """
+ self._size = size
+ assert size > 0
+ self._shuffle = shuffle
+ self._seed = int(seed)
+
+ if dist.is_available() and dist.is_initialized():
+ self._rank = dist.get_rank()
+ self._world_size = dist.get_world_size()
+ else:
+ self._rank = rank
+ self._world_size = world_size
+
+ def __iter__(self):
+ start = self._rank
+ yield from itertools.islice(
+ self._infinite_indices(), start, None, self._world_size
+ )
+
+ def _infinite_indices(self):
+ g = torch.Generator()
+ g.manual_seed(self._seed)
+ while True:
+ if self._shuffle:
+ yield from torch.randperm(self._size, generator=g)
+ else:
+ yield from torch.arange(self._size)
+
+ def __len__(self):
+ return self._size // self._world_size
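A quick sketch of how the two samplers cooperate: with shuffling disabled the infinite stream is simply 0, 1, ..., size-1 repeated, and `YoloBatchSampler` tags every index with the current mosaic flag, which `MosaicDetection.__getitem__` later unpacks. The sizes here are arbitrary.

import itertools

from what.models.detection.yolox.data.samplers import InfiniteSampler, YoloBatchSampler

sampler = InfiniteSampler(size=5, shuffle=False)
batch_sampler = YoloBatchSampler(sampler, batch_size=3, drop_last=False, mosaic=True)

for batch in itertools.islice(iter(batch_sampler), 2):
    # Indices come out of torch.arange as 0-d tensors; cast for readability.
    print([(mosaic, int(idx)) for mosaic, idx in batch])
# [(True, 0), (True, 1), (True, 2)]
# [(True, 3), (True, 4), (True, 0)]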
diff --git a/what/models/detection/yolox/evaluators/__init__.py b/what/models/detection/yolox/evaluators/__init__.py
new file mode 100644
index 0000000..1a99047
--- /dev/null
+++ b/what/models/detection/yolox/evaluators/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+from .coco_evaluator import COCOEvaluator
+from .voc_evaluator import VOCEvaluator
diff --git a/what/models/detection/yolox/evaluators/coco_evaluator.py b/what/models/detection/yolox/evaluators/coco_evaluator.py
new file mode 100644
index 0000000..d1fc384
--- /dev/null
+++ b/what/models/detection/yolox/evaluators/coco_evaluator.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import contextlib
+import io
+import itertools
+import json
+import tempfile
+import time
+from collections import ChainMap, defaultdict
+from loguru import logger
+from tabulate import tabulate
+from tqdm import tqdm
+
+import numpy as np
+
+import torch
+
+from what.models.detection.yolox.data.datasets import COCO_CLASSES
+from what.models.detection.yolox.utils import (
+ gather,
+ is_main_process,
+ postprocess,
+ synchronize,
+ time_synchronized,
+ xyxy2xywh
+)
+
+
+def per_class_AR_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AR"], colums=6):
+ per_class_AR = {}
+ recalls = coco_eval.eval["recall"]
+ # dimension of recalls: [TxKxAxM]
+ # recall has dims (iou, cls, area range, max dets)
+ assert len(class_names) == recalls.shape[1]
+
+ for idx, name in enumerate(class_names):
+ recall = recalls[:, idx, 0, -1]
+ recall = recall[recall > -1]
+ ar = np.mean(recall) if recall.size else float("nan")
+ per_class_AR[name] = float(ar * 100)
+
+ num_cols = min(colums, len(per_class_AR) * len(headers))
+ result_pair = [x for pair in per_class_AR.items() for x in pair]
+ row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)])
+ table_headers = headers * (num_cols // len(headers))
+ table = tabulate(
+ row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left",
+ )
+ return table
+
+
+def per_class_AP_table(coco_eval, class_names=COCO_CLASSES, headers=["class", "AP"], colums=6):
+ per_class_AP = {}
+ precisions = coco_eval.eval["precision"]
+ # dimension of precisions: [TxRxKxAxM]
+ # precision has dims (iou, recall, cls, area range, max dets)
+ assert len(class_names) == precisions.shape[2]
+
+ for idx, name in enumerate(class_names):
+ # area range index 0: all area ranges
+ # max dets index -1: typically 100 per image
+ precision = precisions[:, :, idx, 0, -1]
+ precision = precision[precision > -1]
+ ap = np.mean(precision) if precision.size else float("nan")
+ per_class_AP[name] = float(ap * 100)
+
+ num_cols = min(colums, len(per_class_AP) * len(headers))
+ result_pair = [x for pair in per_class_AP.items() for x in pair]
+ row_pair = itertools.zip_longest(*[result_pair[i::num_cols] for i in range(num_cols)])
+ table_headers = headers * (num_cols // len(headers))
+ table = tabulate(
+ row_pair, tablefmt="pipe", floatfmt=".3f", headers=table_headers, numalign="left",
+ )
+ return table
+
+
+class COCOEvaluator:
+ """
+ COCO AP Evaluation class. All the data in the val2017 dataset are processed
+    and evaluated by the COCO API.
+ """
+
+ def __init__(
+ self,
+ dataloader,
+ img_size: int,
+ confthre: float,
+ nmsthre: float,
+ num_classes: int,
+ testdev: bool = False,
+ per_class_AP: bool = True,
+ per_class_AR: bool = True,
+ ):
+ """
+ Args:
+ dataloader (Dataloader): evaluate dataloader.
+ img_size: image size after preprocess. images are resized
+ to squares whose shape is (img_size, img_size).
+ confthre: confidence threshold ranging from 0 to 1, which
+ is defined in the config file.
+            nmsthre: IoU threshold of non-maximum suppression ranging from 0 to 1.
+            per_class_AP: Show per class AP during evaluation or not. Default to True.
+            per_class_AR: Show per class AR during evaluation or not. Default to True.
+ """
+ self.dataloader = dataloader
+ self.img_size = img_size
+ self.confthre = confthre
+ self.nmsthre = nmsthre
+ self.num_classes = num_classes
+ self.testdev = testdev
+ self.per_class_AP = per_class_AP
+ self.per_class_AR = per_class_AR
+
+ def evaluate(
+ self, model, distributed=False, half=False, trt_file=None,
+ decoder=None, test_size=None, return_outputs=False
+ ):
+ """
+        COCO average precision (AP) evaluation. Iterates inference over the test dataset;
+        the results are evaluated by the COCO API.
+
+ NOTE: This function will change training mode to False, please save states if needed.
+
+ Args:
+ model : model to evaluate.
+
+ Returns:
+ ap50_95 (float) : COCO AP of IoU=50:95
+ ap50 (float) : COCO AP of IoU=50
+            summary (str): summary info of evaluation.
+ """
+ # TODO half to amp_test
+ tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+ model = model.eval()
+ if half:
+ model = model.half()
+ ids = []
+ data_list = []
+ output_data = defaultdict()
+ progress_bar = tqdm if is_main_process() else iter
+
+ inference_time = 0
+ nms_time = 0
+ n_samples = max(len(self.dataloader) - 1, 1)
+
+ if trt_file is not None:
+ from torch2trt import TRTModule
+
+ model_trt = TRTModule()
+ model_trt.load_state_dict(torch.load(trt_file))
+
+ x = torch.ones(1, 3, test_size[0], test_size[1]).cuda()
+ model(x)
+ model = model_trt
+
+ for cur_iter, (imgs, _, info_imgs, ids) in enumerate(
+ progress_bar(self.dataloader)
+ ):
+ with torch.no_grad():
+ imgs = imgs.type(tensor_type)
+
+                # skip the last iters since the batch size might not be enough for batch inference
+ is_time_record = cur_iter < len(self.dataloader) - 1
+ if is_time_record:
+ start = time.time()
+
+ outputs = model(imgs)
+ if decoder is not None:
+ outputs = decoder(outputs, dtype=outputs.type())
+
+ if is_time_record:
+ infer_end = time_synchronized()
+ inference_time += infer_end - start
+
+ outputs = postprocess(
+ outputs, self.num_classes, self.confthre, self.nmsthre
+ )
+ if is_time_record:
+ nms_end = time_synchronized()
+ nms_time += nms_end - infer_end
+
+ data_list_elem, image_wise_data = self.convert_to_coco_format(
+ outputs, info_imgs, ids, return_outputs=True)
+ data_list.extend(data_list_elem)
+ output_data.update(image_wise_data)
+
+ statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+ if distributed:
+ # different process/device might have different speed,
+            # to make sure the process will not get stuck, sync func is used here.
+ synchronize()
+ data_list = gather(data_list, dst=0)
+ output_data = gather(output_data, dst=0)
+ data_list = list(itertools.chain(*data_list))
+ output_data = dict(ChainMap(*output_data))
+ torch.distributed.reduce(statistics, dst=0)
+
+ eval_results = self.evaluate_prediction(data_list, statistics)
+ synchronize()
+
+ if return_outputs:
+ return eval_results, output_data
+ return eval_results
+
+ def convert_to_coco_format(self, outputs, info_imgs, ids, return_outputs=False):
+ data_list = []
+ image_wise_data = defaultdict(dict)
+ for (output, img_h, img_w, img_id) in zip(
+ outputs, info_imgs[0], info_imgs[1], ids
+ ):
+ if output is None:
+ continue
+ output = output.cpu()
+
+ bboxes = output[:, 0:4]
+
+ # preprocessing: resize
+ scale = min(
+ self.img_size[0] / float(img_h), self.img_size[1] / float(img_w)
+ )
+ bboxes /= scale
+ cls = output[:, 6]
+ scores = output[:, 4] * output[:, 5]
+
+ image_wise_data.update({
+ int(img_id): {
+ "bboxes": [box.numpy().tolist() for box in bboxes],
+ "scores": [score.numpy().item() for score in scores],
+ "categories": [
+ self.dataloader.dataset.class_ids[int(cls[ind])]
+ for ind in range(bboxes.shape[0])
+ ],
+ }
+ })
+
+ bboxes = xyxy2xywh(bboxes)
+
+ for ind in range(bboxes.shape[0]):
+ label = self.dataloader.dataset.class_ids[int(cls[ind])]
+ pred_data = {
+ "image_id": int(img_id),
+ "category_id": label,
+ "bbox": bboxes[ind].numpy().tolist(),
+ "score": scores[ind].numpy().item(),
+ "segmentation": [],
+ } # COCO json format
+ data_list.append(pred_data)
+
+ if return_outputs:
+ return data_list, image_wise_data
+ return data_list
+
+ def evaluate_prediction(self, data_dict, statistics):
+ if not is_main_process():
+ return 0, 0, None
+
+ logger.info("Evaluate in main process...")
+
+ annType = ["segm", "bbox", "keypoints"]
+
+ inference_time = statistics[0].item()
+ nms_time = statistics[1].item()
+ n_samples = statistics[2].item()
+
+ a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size)
+ a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size)
+
+ time_info = ", ".join(
+ [
+ "Average {} time: {:.2f} ms".format(k, v)
+ for k, v in zip(
+ ["forward", "NMS", "inference"],
+ [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)],
+ )
+ ]
+ )
+
+ info = time_info + "\n"
+
+ # Evaluate the Dt (detection) json comparing with the ground truth
+ if len(data_dict) > 0:
+ cocoGt = self.dataloader.dataset.coco
+ # TODO: since pycocotools can't process dict in py36, write data to json file.
+ if self.testdev:
+ json.dump(data_dict, open("./yolox_testdev_2017.json", "w"))
+ cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json")
+ else:
+ _, tmp = tempfile.mkstemp()
+ json.dump(data_dict, open(tmp, "w"))
+ cocoDt = cocoGt.loadRes(tmp)
+ try:
+ from yolox.layers import COCOeval_opt as COCOeval
+ except ImportError:
+ from pycocotools.cocoeval import COCOeval
+
+ logger.warning("Use standard COCOeval.")
+
+ cocoEval = COCOeval(cocoGt, cocoDt, annType[1])
+ cocoEval.evaluate()
+ cocoEval.accumulate()
+ redirect_string = io.StringIO()
+ with contextlib.redirect_stdout(redirect_string):
+ cocoEval.summarize()
+ info += redirect_string.getvalue()
+ cat_ids = list(cocoGt.cats.keys())
+ cat_names = [cocoGt.cats[catId]['name'] for catId in sorted(cat_ids)]
+ if self.per_class_AP:
+ AP_table = per_class_AP_table(cocoEval, class_names=cat_names)
+ info += "per class AP:\n" + AP_table + "\n"
+ if self.per_class_AR:
+ AR_table = per_class_AR_table(cocoEval, class_names=cat_names)
+ info += "per class AR:\n" + AR_table + "\n"
+ return cocoEval.stats[0], cocoEval.stats[1], info
+ else:
+ return 0, 0, info
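To see the table layout without running a full evaluation, `per_class_AP_table` can be fed a stand-in object that mimics the `eval["precision"]` array of a pycocotools `COCOeval`; the random numbers and the three class names below are purely illustrative.

from types import SimpleNamespace

import numpy as np

from what.models.detection.yolox.evaluators.coco_evaluator import per_class_AP_table

# precision has dims (iou thresholds, recall points, classes, area ranges, max dets)
fake_eval = SimpleNamespace(eval={"precision": np.random.rand(10, 101, 3, 4, 3)})
print(per_class_AP_table(fake_eval, class_names=["person", "car", "dog"]))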
diff --git a/what/models/detection/yolox/evaluators/voc_eval.py b/what/models/detection/yolox/evaluators/voc_eval.py
new file mode 100644
index 0000000..d1a4748
--- /dev/null
+++ b/what/models/detection/yolox/evaluators/voc_eval.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+# Code are based on
+# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
+# Copyright (c) Bharath Hariharan.
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+import pickle
+import xml.etree.ElementTree as ET
+
+import numpy as np
+
+
+def parse_rec(filename):
+ """Parse a PASCAL VOC xml file"""
+ tree = ET.parse(filename)
+ objects = []
+ for obj in tree.findall("object"):
+ obj_struct = {}
+ obj_struct["name"] = obj.find("name").text
+ obj_struct["pose"] = obj.find("pose").text
+ obj_struct["truncated"] = int(obj.find("truncated").text)
+ obj_struct["difficult"] = int(obj.find("difficult").text)
+ bbox = obj.find("bndbox")
+ obj_struct["bbox"] = [
+ int(bbox.find("xmin").text),
+ int(bbox.find("ymin").text),
+ int(bbox.find("xmax").text),
+ int(bbox.find("ymax").text),
+ ]
+ objects.append(obj_struct)
+
+ return objects
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+ """
+ Compute VOC AP given precision and recall.
+ If use_07_metric is true, uses the
+    VOC 07 11-point method (default: False).
+ """
+ if use_07_metric:
+ # 11 point metric
+ ap = 0.0
+ for t in np.arange(0.0, 1.1, 0.1):
+ if np.sum(rec >= t) == 0:
+ p = 0
+ else:
+ p = np.max(prec[rec >= t])
+ ap = ap + p / 11.0
+ else:
+ # correct AP calculation
+ # first append sentinel values at the end
+ mrec = np.concatenate(([0.0], rec, [1.0]))
+ mpre = np.concatenate(([0.0], prec, [0.0]))
+
+ # compute the precision envelope
+ for i in range(mpre.size - 1, 0, -1):
+ mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+ # to calculate area under PR curve, look for points
+ # where X axis (recall) changes value
+ i = np.where(mrec[1:] != mrec[:-1])[0]
+
+ # and sum (\Delta recall) * prec
+ ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
+ return ap
+
+
+def voc_eval(
+ detpath,
+ annopath,
+ imagesetfile,
+ classname,
+ cachedir,
+ ovthresh=0.5,
+ use_07_metric=False,
+):
+ # first load gt
+ if not os.path.isdir(cachedir):
+ os.mkdir(cachedir)
+ cachefile = os.path.join(cachedir, "annots.pkl")
+ # read list of images
+ with open(imagesetfile, "r") as f:
+ lines = f.readlines()
+ imagenames = [x.strip() for x in lines]
+
+ if not os.path.isfile(cachefile):
+ # load annots
+ recs = {}
+ for i, imagename in enumerate(imagenames):
+ recs[imagename] = parse_rec(annopath.format(imagename))
+ if i % 100 == 0:
+ print(f"Reading annotation for {i + 1}/{len(imagenames)}")
+ # save
+ print(f"Saving cached annotations to {cachefile}")
+ with open(cachefile, "wb") as f:
+ pickle.dump(recs, f)
+ else:
+ # load
+ with open(cachefile, "rb") as f:
+ recs = pickle.load(f)
+
+ # extract gt objects for this class
+ class_recs = {}
+ npos = 0
+ for imagename in imagenames:
+ R = [obj for obj in recs[imagename] if obj["name"] == classname]
+ bbox = np.array([x["bbox"] for x in R])
+ difficult = np.array([x["difficult"] for x in R]).astype(bool)
+ det = [False] * len(R)
+ npos = npos + sum(~difficult)
+ class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
+
+ # read dets
+ detfile = detpath.format(classname)
+ with open(detfile, "r") as f:
+ lines = f.readlines()
+
+ if len(lines) == 0:
+ return 0, 0, 0
+
+ splitlines = [x.strip().split(" ") for x in lines]
+ image_ids = [x[0] for x in splitlines]
+ confidence = np.array([float(x[1]) for x in splitlines])
+ BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+
+ # sort by confidence
+ sorted_ind = np.argsort(-confidence)
+ BB = BB[sorted_ind, :]
+ image_ids = [image_ids[x] for x in sorted_ind]
+
+ # go down dets and mark TPs and FPs
+ nd = len(image_ids)
+ tp = np.zeros(nd)
+ fp = np.zeros(nd)
+ for d in range(nd):
+ R = class_recs[image_ids[d]]
+ bb = BB[d, :].astype(float)
+ ovmax = -np.inf
+ BBGT = R["bbox"].astype(float)
+
+ if BBGT.size > 0:
+ # compute overlaps
+ # intersection
+ ixmin = np.maximum(BBGT[:, 0], bb[0])
+ iymin = np.maximum(BBGT[:, 1], bb[1])
+ ixmax = np.minimum(BBGT[:, 2], bb[2])
+ iymax = np.minimum(BBGT[:, 3], bb[3])
+ iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
+ ih = np.maximum(iymax - iymin + 1.0, 0.0)
+ inters = iw * ih
+
+ # union
+ uni = (
+ (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+ + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) - inters
+ )
+
+ overlaps = inters / uni
+ ovmax = np.max(overlaps)
+ jmax = np.argmax(overlaps)
+
+ if ovmax > ovthresh:
+ if not R["difficult"][jmax]:
+ if not R["det"][jmax]:
+ tp[d] = 1.0
+ R["det"][jmax] = 1
+ else:
+ fp[d] = 1.0
+ else:
+ fp[d] = 1.0
+
+ # compute precision recall
+ fp = np.cumsum(fp)
+ tp = np.cumsum(tp)
+ rec = tp / float(npos)
+ # avoid divide by zero in case the first detection matches a difficult
+ # ground truth
+ prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+ ap = voc_ap(rec, prec, use_07_metric)
+
+ return rec, prec, ap
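A toy run of `voc_ap` on the precision/recall curve that three detections (two true positives out of two ground-truth boxes) would produce; the numbers are only meant to show the difference between the two metrics.

import numpy as np

from what.models.detection.yolox.evaluators.voc_eval import voc_ap

rec = np.array([0.5, 0.5, 1.0])
prec = np.array([1.0, 0.5, 2.0 / 3.0])

print(voc_ap(rec, prec, use_07_metric=False))  # ~0.833, area under the interpolated PR curve
print(voc_ap(rec, prec, use_07_metric=True))   # ~0.848, VOC2007 11-point approximation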
diff --git a/what/models/detection/yolox/evaluators/voc_evaluator.py b/what/models/detection/yolox/evaluators/voc_evaluator.py
new file mode 100644
index 0000000..e568811
--- /dev/null
+++ b/what/models/detection/yolox/evaluators/voc_evaluator.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import sys
+import tempfile
+import time
+from collections import ChainMap
+from loguru import logger
+from tqdm import tqdm
+
+import numpy as np
+
+import torch
+
+from what.models.detection.yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized
+
+
+class VOCEvaluator:
+ """
+ VOC AP Evaluation class.
+ """
+
+ def __init__(self, dataloader, img_size, confthre, nmsthre, num_classes):
+ """
+ Args:
+ dataloader (Dataloader): evaluate dataloader.
+ img_size (int): image size after preprocess. images are resized
+ to squares whose shape is (img_size, img_size).
+ confthre (float): confidence threshold ranging from 0 to 1, which
+ is defined in the config file.
+            nmsthre (float): IoU threshold of non-maximum suppression ranging from 0 to 1.
+ """
+ self.dataloader = dataloader
+ self.img_size = img_size
+ self.confthre = confthre
+ self.nmsthre = nmsthre
+ self.num_classes = num_classes
+ self.num_images = len(dataloader.dataset)
+
+ def evaluate(
+ self, model, distributed=False, half=False, trt_file=None,
+ decoder=None, test_size=None, return_outputs=False,
+ ):
+ """
+        VOC average precision (AP) evaluation. Iterates inference over the test dataset;
+        the results are evaluated with the PASCAL VOC protocol.
+
+ NOTE: This function will change training mode to False, please save states if needed.
+
+ Args:
+ model : model to evaluate.
+
+ Returns:
+ ap50_95 (float) : COCO style AP of IoU=50:95
+ ap50 (float) : VOC 2007 metric AP of IoU=50
+            summary (str): summary info of evaluation.
+ """
+ # TODO half to amp_test
+ tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+ model = model.eval()
+ if half:
+ model = model.half()
+ ids = []
+ data_list = {}
+ progress_bar = tqdm if is_main_process() else iter
+
+ inference_time = 0
+ nms_time = 0
+ n_samples = max(len(self.dataloader) - 1, 1)
+
+ if trt_file is not None:
+ from torch2trt import TRTModule
+
+ model_trt = TRTModule()
+ model_trt.load_state_dict(torch.load(trt_file))
+
+ x = torch.ones(1, 3, test_size[0], test_size[1]).cuda()
+ model(x)
+ model = model_trt
+
+ for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)):
+ with torch.no_grad():
+ imgs = imgs.type(tensor_type)
+
+                # skip the last iters since the batch size might not be enough for batch inference
+ is_time_record = cur_iter < len(self.dataloader) - 1
+ if is_time_record:
+ start = time.time()
+
+ outputs = model(imgs)
+ if decoder is not None:
+ outputs = decoder(outputs, dtype=outputs.type())
+
+ if is_time_record:
+ infer_end = time_synchronized()
+ inference_time += infer_end - start
+
+ outputs = postprocess(
+ outputs, self.num_classes, self.confthre, self.nmsthre
+ )
+ if is_time_record:
+ nms_end = time_synchronized()
+ nms_time += nms_end - infer_end
+
+ data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids))
+
+ statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+ if distributed:
+ data_list = gather(data_list, dst=0)
+ data_list = ChainMap(*data_list)
+ torch.distributed.reduce(statistics, dst=0)
+
+ eval_results = self.evaluate_prediction(data_list, statistics)
+ synchronize()
+ if return_outputs:
+ return eval_results, data_list
+ return eval_results
+
+ def convert_to_voc_format(self, outputs, info_imgs, ids):
+ predictions = {}
+ for output, img_h, img_w, img_id in zip(outputs, info_imgs[0], info_imgs[1], ids):
+ if output is None:
+ predictions[int(img_id)] = (None, None, None)
+ continue
+ output = output.cpu()
+
+ bboxes = output[:, 0:4]
+
+ # preprocessing: resize
+ scale = min(self.img_size[0] / float(img_h), self.img_size[1] / float(img_w))
+ bboxes /= scale
+
+ cls = output[:, 6]
+ scores = output[:, 4] * output[:, 5]
+
+ predictions[int(img_id)] = (bboxes, cls, scores)
+ return predictions
+
+ def evaluate_prediction(self, data_dict, statistics):
+ if not is_main_process():
+ return 0, 0, None
+
+ logger.info("Evaluate in main process...")
+
+ inference_time = statistics[0].item()
+ nms_time = statistics[1].item()
+ n_samples = statistics[2].item()
+
+ a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size)
+ a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size)
+
+ time_info = ", ".join(
+ [
+ "Average {} time: {:.2f} ms".format(k, v)
+ for k, v in zip(
+ ["forward", "NMS", "inference"],
+ [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)],
+ )
+ ]
+ )
+ info = time_info + "\n"
+
+ all_boxes = [
+ [[] for _ in range(self.num_images)] for _ in range(self.num_classes)
+ ]
+ for img_num in range(self.num_images):
+ bboxes, cls, scores = data_dict[img_num]
+ if bboxes is None:
+ for j in range(self.num_classes):
+ all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
+ continue
+ for j in range(self.num_classes):
+ mask_c = cls == j
+ if sum(mask_c) == 0:
+ all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
+ continue
+
+ c_dets = torch.cat((bboxes, scores.unsqueeze(1)), dim=1)
+ all_boxes[j][img_num] = c_dets[mask_c].numpy()
+
+ sys.stdout.write(f"im_eval: {img_num + 1}/{self.num_images} \r")
+ sys.stdout.flush()
+
+ with tempfile.TemporaryDirectory() as tempdir:
+            mAP50_95, mAP50 = self.dataloader.dataset.evaluate_detections(all_boxes, tempdir)
+            return mAP50_95, mAP50, info
diff --git a/what/models/detection/yolox/exp/__init__.py b/what/models/detection/yolox/exp/__init__.py
new file mode 100644
index 0000000..40e5f58
--- /dev/null
+++ b/what/models/detection/yolox/exp/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+from .base_exp import BaseExp
+from .build import get_exp
+from .yolox_base import Exp, check_exp_value
diff --git a/what/models/detection/yolox/exp/base_exp.py b/what/models/detection/yolox/exp/base_exp.py
new file mode 100644
index 0000000..97aa769
--- /dev/null
+++ b/what/models/detection/yolox/exp/base_exp.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import ast
+import pprint
+from abc import ABCMeta, abstractmethod
+from typing import Dict, List, Tuple
+from tabulate import tabulate
+
+import torch
+from torch.nn import Module
+
+from what.models.detection.yolox.utils import LRScheduler
+
+
+class BaseExp(metaclass=ABCMeta):
+ """Basic class for any experiment."""
+
+ def __init__(self):
+ self.seed = None
+ self.output_dir = "./YOLOX_outputs"
+ self.print_interval = 100
+ self.eval_interval = 10
+ self.dataset = None
+
+ @abstractmethod
+ def get_model(self) -> Module:
+ pass
+
+ @abstractmethod
+ def get_dataset(self, cache: bool = False, cache_type: str = "ram"):
+ pass
+
+ @abstractmethod
+ def get_data_loader(
+ self, batch_size: int, is_distributed: bool
+ ) -> Dict[str, torch.utils.data.DataLoader]:
+ pass
+
+ @abstractmethod
+ def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer:
+ pass
+
+ @abstractmethod
+ def get_lr_scheduler(
+ self, lr: float, iters_per_epoch: int, **kwargs
+ ) -> LRScheduler:
+ pass
+
+ @abstractmethod
+ def get_evaluator(self):
+ pass
+
+ @abstractmethod
+ def eval(self, model, evaluator, weights):
+ pass
+
+ def __repr__(self):
+ table_header = ["keys", "values"]
+ exp_table = [
+ (str(k), pprint.pformat(v))
+ for k, v in vars(self).items()
+ if not k.startswith("_")
+ ]
+ return tabulate(exp_table, headers=table_header, tablefmt="fancy_grid")
+
+ def merge(self, cfg_list):
+ assert len(cfg_list) % 2 == 0, f"length must be even, check value here: {cfg_list}"
+ for k, v in zip(cfg_list[0::2], cfg_list[1::2]):
+ # only update value with same key
+ if hasattr(self, k):
+ src_value = getattr(self, k)
+ src_type = type(src_value)
+
+ # pre-process input if source type is list or tuple
+ if isinstance(src_value, (List, Tuple)):
+ v = v.strip("[]()")
+ v = [t.strip() for t in v.split(",")]
+
+ # find type of tuple
+ if len(src_value) > 0:
+ src_item_type = type(src_value[0])
+ v = [src_item_type(t) for t in v]
+
+ if src_value is not None and src_type != type(v):
+ try:
+ v = src_type(v)
+ except Exception:
+ v = ast.literal_eval(v)
+ setattr(self, k, v)
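`merge` only touches attributes through `hasattr`/`getattr`/`setattr`, so its string-to-value coercion can be exercised on a plain namespace, mirroring how key/value command-line overrides are applied to an experiment. The attribute names and override values here are arbitrary.

from types import SimpleNamespace

from what.models.detection.yolox.exp.base_exp import BaseExp

cfg = SimpleNamespace(num_classes=80, input_size=(640, 640))
# "20" is coerced back to int, "(512, 512)" back to a tuple of ints.
BaseExp.merge(cfg, ["num_classes", "20", "input_size", "(512, 512)"])
print(cfg.num_classes, cfg.input_size)  # 20 (512, 512)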
diff --git a/what/models/detection/yolox/exp/build.py b/what/models/detection/yolox/exp/build.py
new file mode 100644
index 0000000..146954a
--- /dev/null
+++ b/what/models/detection/yolox/exp/build.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import importlib
+import os
+import sys
+
+
+def get_exp_by_file(exp_file):
+ try:
+ sys.path.append(os.path.dirname(exp_file))
+ current_exp = importlib.import_module(os.path.basename(exp_file).split(".")[0])
+ exp = current_exp.Exp()
+ except Exception:
+        raise ImportError("{} doesn't contain a class named 'Exp'".format(exp_file))
+ return exp
+
+
+def get_exp_by_name(exp_name):
+ exp = exp_name.replace("-", "_") # convert string like "yolox-s" to "yolox_s"
+ module_name = ".".join(["what", "models", "detection", "yolox", "exp", "default", exp])
+ exp_object = importlib.import_module(module_name).Exp()
+ return exp_object
+
+
+def get_exp(exp_file=None, exp_name=None):
+ """
+ get Exp object by file or name. If exp_file and exp_name
+ are both provided, get Exp by exp_file.
+
+ Args:
+ exp_file (str): file path of experiment.
+        exp_name (str): name of experiment, e.g. "yolox-s".
+ """
+ assert (
+ exp_file is not None or exp_name is not None
+ ), "plz provide exp file or exp name."
+ if exp_file is not None:
+ return get_exp_by_file(exp_file)
+ else:
+ return get_exp_by_name(exp_name)
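Loading an experiment by name resolves through the meta-path finder registered in exp/default/__init__.py, so the sketch below assumes the default exp files are actually available (e.g. an in-place install); the printed depth/width factors are the standard YOLOX-S settings upstream.

from what.models.detection.yolox.exp import get_exp

exp = get_exp(exp_name="yolox-s")  # resolved to the yolox_s default experiment
print(exp.depth, exp.width)        # 0.33 0.5 for YOLOX-S, if the defaults are unchanged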
diff --git a/what/models/detection/yolox/exp/default/__init__.py b/what/models/detection/yolox/exp/default/__init__.py
new file mode 100644
index 0000000..d482183
--- /dev/null
+++ b/what/models/detection/yolox/exp/default/__init__.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+# This file is used for package installation and find default exp file
+
+import sys
+from importlib import abc, util
+from pathlib import Path
+
+_EXP_PATH = Path(__file__).resolve().parent.parent.parent / "exps" / "default"
+
+if _EXP_PATH.is_dir():
+ # This is true only for in-place installation (pip install -e, setup.py develop),
+ # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230
+
+ class _ExpFinder(abc.MetaPathFinder):
+
+ def find_spec(self, name, path, target=None):
+ if not name.startswith("what.models.detection.yolox.exp.default"):
+ return
+ project_name = name.split(".")[-1] + ".py"
+ target_file = _EXP_PATH / project_name
+ if not target_file.is_file():
+ return
+ return util.spec_from_file_location(name, target_file)
+
+ sys.meta_path.append(_ExpFinder())
diff --git a/what/models/detection/yolox/exp/yolox_base.py b/what/models/detection/yolox/exp/yolox_base.py
new file mode 100644
index 0000000..a0e3718
--- /dev/null
+++ b/what/models/detection/yolox/exp/yolox_base.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import os
+import random
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+from .base_exp import BaseExp
+
+__all__ = ["Exp", "check_exp_value"]
+
+
+class Exp(BaseExp):
+ def __init__(self):
+ super().__init__()
+
+ # ---------------- model config ---------------- #
+ # detect classes number of model
+ self.num_classes = 80
+ # factor of model depth
+ self.depth = 1.00
+ # factor of model width
+ self.width = 1.00
+        # activation name. For example, if set to "relu", "silu" activations will be replaced with "relu".
+ self.act = "silu"
+
+ # ---------------- dataloader config ---------------- #
+        # set workers to 4 for shorter dataloader init time
+        # If your training process costs too much memory, reduce this value.
+ self.data_num_workers = 4
+ self.input_size = (640, 640) # (height, width)
+ # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32].
+ # To disable multiscale training, set the value to 0.
+ self.multiscale_range = 5
+ # You can uncomment this line to specify a multiscale range
+ # self.random_size = (14, 26)
+ # dir of dataset images, if data_dir is None, this project will use `datasets` dir
+ self.data_dir = None
+ # name of annotation file for training
+ self.train_ann = "instances_train2017.json"
+ # name of annotation file for evaluation
+ self.val_ann = "instances_val2017.json"
+ # name of annotation file for testing
+ self.test_ann = "instances_test2017.json"
+
+ # --------------- transform config ----------------- #
+ # prob of applying mosaic aug
+ self.mosaic_prob = 1.0
+ # prob of applying mixup aug
+ self.mixup_prob = 1.0
+ # prob of applying hsv aug
+ self.hsv_prob = 1.0
+ # prob of applying flip aug
+ self.flip_prob = 0.5
+ # rotation angle range, for example, if set to 2, the true range is (-2, 2)
+ self.degrees = 10.0
+ # translate range, for example, if set to 0.1, the true range is (-0.1, 0.1)
+ self.translate = 0.1
+ self.mosaic_scale = (0.1, 2)
+ # apply mixup aug or not
+ self.enable_mixup = True
+ self.mixup_scale = (0.5, 1.5)
+ # shear angle range, for example, if set to 2, the true range is (-2, 2)
+ self.shear = 2.0
+
+ # -------------- training config --------------------- #
+ # epoch number used for warmup
+ self.warmup_epochs = 5
+ # max training epoch
+ self.max_epoch = 300
+ # minimum learning rate during warmup
+ self.warmup_lr = 0
+ self.min_lr_ratio = 0.05
+        # learning rate per image. During training, lr will be multiplied by the batch size.
+ self.basic_lr_per_img = 0.01 / 64.0
+ # name of LRScheduler
+ self.scheduler = "yoloxwarmcos"
+        # number of final epochs during which augmentation like mosaic is disabled
+ self.no_aug_epochs = 15
+ # apply EMA during training
+ self.ema = True
+
+ # weight decay of optimizer
+ self.weight_decay = 5e-4
+ # momentum of optimizer
+ self.momentum = 0.9
+ # log period in iter, for example,
+        # if set to 1, the user will see a log entry every iteration.
+ self.print_interval = 10
+ # eval period in epoch, for example,
+        # if set to 1, the model will be evaluated after every epoch.
+ self.eval_interval = 10
+ # save history checkpoint or not.
+ # If set to False, yolox will only save latest and best ckpt.
+ self.save_history_ckpt = True
+ # name of experiment
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ # ----------------- testing config ------------------ #
+ # output image size during evaluation/test
+ self.test_size = (640, 640)
+ # confidence threshold during evaluation/test,
+ # boxes whose scores are less than test_conf will be filtered
+ self.test_conf = 0.01
+ # nms threshold
+ self.nmsthre = 0.65
+
+ def get_model(self):
+ from what.models.detection.yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+
+ if getattr(self, "model", None) is None:
+ in_channels = [256, 512, 1024]
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ self.model.train()
+ return self.model
+
+ def get_dataset(self, cache: bool = False, cache_type: str = "ram"):
+ """
+ Get dataset according to cache and cache_type parameters.
+ Args:
+ cache (bool): Whether to cache imgs to ram or disk.
+ cache_type (str, optional): Defaults to "ram".
+ "ram" : Caching imgs to ram for fast training.
+ "disk": Caching imgs to disk for fast training.
+ """
+ from yolox.data import COCODataset, TrainTransform
+
+ return COCODataset(
+ data_dir=self.data_dir,
+ json_file=self.train_ann,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob
+ ),
+ cache=cache,
+ cache_type=cache_type,
+ )
+
+ def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None):
+ """
+ Get dataloader according to cache_img parameter.
+ Args:
+ no_aug (bool, optional): Whether to turn off mosaic data enhancement. Defaults to False.
+ cache_img (str, optional): cache_img is equivalent to cache_type. Defaults to None.
+ "ram" : Caching imgs to ram for fast training.
+ "disk": Caching imgs to disk for fast training.
+ None: Do not use cache, in this case cache_data is also None.
+ """
+ from yolox.data import (
+ TrainTransform,
+ YoloBatchSampler,
+ DataLoader,
+ InfiniteSampler,
+ MosaicDetection,
+ worker_init_reset_seed,
+ )
+ from yolox.utils import wait_for_the_master
+
+ # if cache is True, we will create self.dataset before launch
+ # else we will create self.dataset after launch
+ if self.dataset is None:
+ with wait_for_the_master():
+ assert cache_img is None, \
+ "cache_img must be None if you didn't create self.dataset before launch"
+ self.dataset = self.get_dataset(cache=False, cache_type=cache_img)
+
+ self.dataset = MosaicDetection(
+ dataset=self.dataset,
+ mosaic=not no_aug,
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=120,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ degrees=self.degrees,
+ translate=self.translate,
+ mosaic_scale=self.mosaic_scale,
+ mixup_scale=self.mixup_scale,
+ shear=self.shear,
+ enable_mixup=self.enable_mixup,
+ mosaic_prob=self.mosaic_prob,
+ mixup_prob=self.mixup_prob,
+ )
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+
+ sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0)
+
+ batch_sampler = YoloBatchSampler(
+ sampler=sampler,
+ batch_size=batch_size,
+ drop_last=False,
+ mosaic=not no_aug,
+ )
+
+ dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+ dataloader_kwargs["batch_sampler"] = batch_sampler
+
+ # Make sure each process has different random seed, especially for 'fork' method.
+ # Check https://github.com/pytorch/pytorch/issues/63311 for more details.
+ dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed
+
+ train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+ return train_loader
+
+ def random_resize(self, data_loader, epoch, rank, is_distributed):
+ tensor = torch.LongTensor(2).cuda()
+
+ if rank == 0:
+ size_factor = self.input_size[1] * 1.0 / self.input_size[0]
+ if not hasattr(self, 'random_size'):
+ min_size = int(self.input_size[0] / 32) - self.multiscale_range
+ max_size = int(self.input_size[0] / 32) + self.multiscale_range
+ self.random_size = (min_size, max_size)
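+ # Illustrative note: with the default input_size = (640, 640) and
+ # multiscale_range = 5 this gives random_size = (15, 25), i.e. training
+ # sizes of 480..800 px after the * 32 scaling below.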
+ size = random.randint(*self.random_size)
+ size = (int(32 * size), 32 * int(size * size_factor))
+ tensor[0] = size[0]
+ tensor[1] = size[1]
+
+ if is_distributed:
+ dist.barrier()
+ dist.broadcast(tensor, 0)
+
+ input_size = (tensor[0].item(), tensor[1].item())
+ return input_size
+
+ def preprocess(self, inputs, targets, tsize):
+ scale_y = tsize[0] / self.input_size[0]
+ scale_x = tsize[1] / self.input_size[1]
+ if scale_x != 1 or scale_y != 1:
+ inputs = nn.functional.interpolate(
+ inputs, size=tsize, mode="bilinear", align_corners=False
+ )
+ targets[..., 1::2] = targets[..., 1::2] * scale_x
+ targets[..., 2::2] = targets[..., 2::2] * scale_y
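+ # Note: with the (class, cx, cy, w, h) target layout used here, columns
+ # 1::2 hold x-direction values (cx, w) and 2::2 hold y-direction values
+ # (cy, h); both are rescaled to the new tsize above.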
+ return inputs, targets
+
+ def get_optimizer(self, batch_size):
+ if "optimizer" not in self.__dict__:
+ if self.warmup_epochs > 0:
+ lr = self.warmup_lr
+ else:
+ lr = self.basic_lr_per_img * batch_size
+
+ pg0, pg1, pg2 = [], [], [] # optimizer parameter groups
+
+ for k, v in self.model.named_modules():
+ if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
+ pg2.append(v.bias) # biases
+ if isinstance(v, nn.BatchNorm2d) or "bn" in k:
+ pg0.append(v.weight) # no decay
+ elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
+ pg1.append(v.weight) # apply decay
+
+ optimizer = torch.optim.SGD(
+ pg0, lr=lr, momentum=self.momentum, nesterov=True
+ )
+ optimizer.add_param_group(
+ {"params": pg1, "weight_decay": self.weight_decay}
+ ) # add pg1 with weight_decay
+ optimizer.add_param_group({"params": pg2})
+ self.optimizer = optimizer
+
+ return self.optimizer
+
+ def get_lr_scheduler(self, lr, iters_per_epoch):
+ from yolox.utils import LRScheduler
+
+ scheduler = LRScheduler(
+ self.scheduler,
+ lr,
+ iters_per_epoch,
+ self.max_epoch,
+ warmup_epochs=self.warmup_epochs,
+ warmup_lr_start=self.warmup_lr,
+ no_aug_epochs=self.no_aug_epochs,
+ min_lr_ratio=self.min_lr_ratio,
+ )
+ return scheduler
+
+ def get_eval_dataset(self, **kwargs):
+ from yolox.data import COCODataset, ValTransform
+ testdev = kwargs.get("testdev", False)
+ legacy = kwargs.get("legacy", False)
+
+ return COCODataset(
+ data_dir=self.data_dir,
+ json_file=self.val_ann if not testdev else self.test_ann,
+ name="val2017" if not testdev else "test2017",
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ def get_eval_loader(self, batch_size, is_distributed, **kwargs):
+ valdataset = self.get_eval_dataset(**kwargs)
+
+ if is_distributed:
+ batch_size = batch_size // dist.get_world_size()
+ sampler = torch.utils.data.distributed.DistributedSampler(
+ valdataset, shuffle=False
+ )
+ else:
+ sampler = torch.utils.data.SequentialSampler(valdataset)
+
+ dataloader_kwargs = {
+ "num_workers": self.data_num_workers,
+ "pin_memory": True,
+ "sampler": sampler,
+ }
+ dataloader_kwargs["batch_size"] = batch_size
+ val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+ return val_loader
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import COCOEvaluator
+
+ return COCOEvaluator(
+ dataloader=self.get_eval_loader(batch_size, is_distributed,
+ testdev=testdev, legacy=legacy),
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ testdev=testdev,
+ )
+
+ def get_trainer(self, args):
+ from yolox.core import Trainer
+ trainer = Trainer(self, args)
+ # NOTE: trainer shouldn't be an attribute of exp object
+ return trainer
+
+ def eval(self, model, evaluator, is_distributed, half=False, return_outputs=False):
+ return evaluator.evaluate(model, is_distributed, half, return_outputs=return_outputs)
+
+
+def check_exp_value(exp: Exp):
+ h, w = exp.input_size
+ assert h % 32 == 0 and w % 32 == 0, "input size must be multiples of 32"
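+
+# Example usage (illustrative sketch, not shipped code): assumes a COCO-style
+# dataset is available under `datasets/` and a single-GPU setup.
+#
+#   exp = Exp()
+#   check_exp_value(exp)
+#   model = exp.get_model()
+#   optimizer = exp.get_optimizer(batch_size=64)
+#   train_loader = exp.get_data_loader(batch_size=64, is_distributed=False)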
diff --git a/what/models/detection/yolox/exps/default/__init__.py b/what/models/detection/yolox/exps/default/__init__.py
new file mode 100644
index 0000000..ce9fae0
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
diff --git a/what/models/detection/yolox/exps/default/yolov3.py b/what/models/detection/yolox/exps/default/yolov3.py
new file mode 100644
index 0000000..2019dad
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolov3.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch.nn as nn
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 1.0
+ self.width = 1.0
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_model(self, sublinear=False):
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOFPN, YOLOXHead
+ backbone = YOLOFPN()
+ head = YOLOXHead(self.num_classes, self.width, in_channels=[128, 256, 512], act="lrelu")
+ self.model = YOLOX(backbone, head)
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+
+ return self.model
diff --git a/what/models/detection/yolox/exps/default/yolox_l.py b/what/models/detection/yolox/exps/default/yolox_l.py
new file mode 100644
index 0000000..2705c52
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolox_l.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 1.0
+ self.width = 1.0
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
diff --git a/what/models/detection/yolox/exps/default/yolox_m.py b/what/models/detection/yolox/exps/default/yolox_m.py
new file mode 100644
index 0000000..8c75e17
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolox_m.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.67
+ self.width = 0.75
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
diff --git a/what/models/detection/yolox/exps/default/yolox_nano.py b/what/models/detection/yolox/exps/default/yolox_nano.py
new file mode 100644
index 0000000..cecfff6
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolox_nano.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch.nn as nn
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.25
+ self.input_size = (416, 416)
+ self.random_size = (10, 20)
+ self.mosaic_scale = (0.5, 1.5)
+ self.test_size = (416, 416)
+ self.mosaic_prob = 0.5
+ self.enable_mixup = False
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+ # The NANO model uses depthwise = True, which is the main difference.
+ backbone = YOLOPAFPN(
+ self.depth, self.width, in_channels=in_channels,
+ act=self.act, depthwise=True,
+ )
+ head = YOLOXHead(
+ self.num_classes, self.width, in_channels=in_channels,
+ act=self.act, depthwise=True
+ )
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
diff --git a/what/models/detection/yolox/exps/default/yolox_s.py b/what/models/detection/yolox/exps/default/yolox_s.py
new file mode 100644
index 0000000..dee29a7
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolox_s.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.50
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
diff --git a/what/models/detection/yolox/exps/default/yolox_tiny.py b/what/models/detection/yolox/exps/default/yolox_tiny.py
new file mode 100644
index 0000000..223f891
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolox_tiny.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.375
+ self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+ self.enable_mixup = False
diff --git a/what/models/detection/yolox/exps/default/yolox_x.py b/what/models/detection/yolox/exps/default/yolox_x.py
new file mode 100644
index 0000000..559b618
--- /dev/null
+++ b/what/models/detection/yolox/exps/default/yolox_x.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+from what.models.detection.yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 1.33
+ self.width = 1.25
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
diff --git a/what/models/detection/yolox/exps/example/custom/nano.py b/what/models/detection/yolox/exps/example/custom/nano.py
new file mode 100644
index 0000000..fb10626
--- /dev/null
+++ b/what/models/detection/yolox/exps/example/custom/nano.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+import os
+
+import torch.nn as nn
+
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.25
+ self.input_size = (416, 416)
+ self.mosaic_scale = (0.5, 1.5)
+ self.random_size = (10, 20)
+ self.test_size = (416, 416)
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+ self.enable_mixup = False
+
+ # Define your own dataset path
+ self.data_dir = "datasets/coco128"
+ self.train_ann = "instances_train2017.json"
+ self.val_ann = "instances_val2017.json"
+
+ self.num_classes = 71
+
+ def get_model(self, sublinear=False):
+
+ def init_yolo(M):
+ for m in M.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eps = 1e-3
+ m.momentum = 0.03
+ if "model" not in self.__dict__:
+ from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+ in_channels = [256, 512, 1024]
+ # The NANO model uses depthwise = True, which is the main difference.
+ backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, depthwise=True)
+ head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, depthwise=True)
+ self.model = YOLOX(backbone, head)
+
+ self.model.apply(init_yolo)
+ self.model.head.initialize_biases(1e-2)
+ return self.model
diff --git a/what/models/detection/yolox/exps/example/custom/yolox_s.py b/what/models/detection/yolox/exps/example/custom/yolox_s.py
new file mode 100644
index 0000000..2f0b0a5
--- /dev/null
+++ b/what/models/detection/yolox/exps/example/custom/yolox_s.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii, Inc. and its affiliates.
+import os
+
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.depth = 0.33
+ self.width = 0.50
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ # Define your own dataset path
+ self.data_dir = "datasets/coco128"
+ self.train_ann = "instances_train2017.json"
+ self.val_ann = "instances_val2017.json"
+
+ self.num_classes = 71
+
+ self.max_epoch = 300
+ self.data_num_workers = 4
+ self.eval_interval = 1
diff --git a/what/models/detection/yolox/exps/example/yolox_voc/yolox_voc_s.py b/what/models/detection/yolox/exps/example/yolox_voc/yolox_voc_s.py
new file mode 100644
index 0000000..379ba9a
--- /dev/null
+++ b/what/models/detection/yolox/exps/example/yolox_voc/yolox_voc_s.py
@@ -0,0 +1,60 @@
+# encoding: utf-8
+import os
+
+from yolox.data import get_yolox_datadir
+from yolox.exp import Exp as MyExp
+
+
+class Exp(MyExp):
+ def __init__(self):
+ super(Exp, self).__init__()
+ self.num_classes = 20
+ self.depth = 0.33
+ self.width = 0.50
+ self.warmup_epochs = 1
+
+ # ---------- transform config ------------ #
+ self.mosaic_prob = 1.0
+ self.mixup_prob = 1.0
+ self.hsv_prob = 1.0
+ self.flip_prob = 0.5
+
+ self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+ def get_dataset(self, cache: bool, cache_type: str = "ram"):
+ from yolox.data import VOCDetection, TrainTransform
+
+ return VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
+ image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
+ img_size=self.input_size,
+ preproc=TrainTransform(
+ max_labels=50,
+ flip_prob=self.flip_prob,
+ hsv_prob=self.hsv_prob),
+ cache=cache,
+ cache_type=cache_type,
+ )
+
+ def get_eval_dataset(self, **kwargs):
+ from yolox.data import VOCDetection, ValTransform
+ legacy = kwargs.get("legacy", False)
+
+ return VOCDetection(
+ data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
+ image_sets=[('2007', 'test')],
+ img_size=self.test_size,
+ preproc=ValTransform(legacy=legacy),
+ )
+
+ def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
+ from yolox.evaluators import VOCEvaluator
+
+ return VOCEvaluator(
+ dataloader=self.get_eval_loader(batch_size, is_distributed,
+ testdev=testdev, legacy=legacy),
+ img_size=self.test_size,
+ confthre=self.test_conf,
+ nmsthre=self.nmsthre,
+ num_classes=self.num_classes,
+ )
diff --git a/what/models/detection/yolox/layers/__init__.py b/what/models/detection/yolox/layers/__init__.py
new file mode 100644
index 0000000..fc9cf51
--- /dev/null
+++ b/what/models/detection/yolox/layers/__init__.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+# import torch first to make jit op work without `ImportError of libc10.so`
+import torch # noqa
+
+from .jit_ops import FastCOCOEvalOp, JitOp
+
+try:
+ from .fast_coco_eval_api import COCOeval_opt
+except ImportError: # exception will be raised when users build yolox from source
+ pass
diff --git a/what/models/detection/yolox/layers/cocoeval/cocoeval.cpp b/what/models/detection/yolox/layers/cocoeval/cocoeval.cpp
new file mode 100644
index 0000000..2e63bc9
--- /dev/null
+++ b/what/models/detection/yolox/layers/cocoeval/cocoeval.cpp
@@ -0,0 +1,502 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "cocoeval.h"
+#include <time.h>
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+
+using namespace pybind11::literals;
+
+namespace COCOeval {
+
+// Sort detections from highest score to lowest, such that
+// detection_instances[detection_sorted_indices[t]] >=
+// detection_instances[detection_sorted_indices[t+1]]. Use stable_sort to match
+// original COCO API
+void SortInstancesByDetectionScore(
+ const std::vector<InstanceAnnotation>& detection_instances,
+ std::vector<uint64_t>* detection_sorted_indices) {
+ detection_sorted_indices->resize(detection_instances.size());
+ std::iota(
+ detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
+ std::stable_sort(
+ detection_sorted_indices->begin(),
+ detection_sorted_indices->end(),
+ [&detection_instances](size_t j1, size_t j2) {
+ return detection_instances[j1].score > detection_instances[j2].score;
+ });
+}
+
+// Partition the ground truth objects based on whether or not to ignore them
+// based on area
+void SortInstancesByIgnore(
+ const std::array<double, 2>& area_range,
+ const std::vector<InstanceAnnotation>& ground_truth_instances,
+ std::vector<uint64_t>* ground_truth_sorted_indices,
+ std::vector<bool>* ignores) {
+ ignores->clear();
+ ignores->reserve(ground_truth_instances.size());
+ for (auto o : ground_truth_instances) {
+ ignores->push_back(
+ o.ignore || o.area < area_range[0] || o.area > area_range[1]);
+ }
+
+ ground_truth_sorted_indices->resize(ground_truth_instances.size());
+ std::iota(
+ ground_truth_sorted_indices->begin(),
+ ground_truth_sorted_indices->end(),
+ 0);
+ std::stable_sort(
+ ground_truth_sorted_indices->begin(),
+ ground_truth_sorted_indices->end(),
+ [&ignores](size_t j1, size_t j2) {
+ return (int)(*ignores)[j1] < (int)(*ignores)[j2];
+ });
+}
+
+// For each IOU threshold, greedily match each detected instance to a ground
+// truth instance (if possible) and store the results
+void MatchDetectionsToGroundTruth(
+ const std::vector<InstanceAnnotation>& detection_instances,
+ const std::vector<uint64_t>& detection_sorted_indices,
+ const std::vector<InstanceAnnotation>& ground_truth_instances,
+ const std::vector<uint64_t>& ground_truth_sorted_indices,
+ const std::vector<bool>& ignores,
+ const std::vector<std::vector<double>>& ious,
+ const std::vector<double>& iou_thresholds,
+ const std::array<double, 2>& area_range,
+ ImageEvaluation* results) {
+ // Initialize memory to store return data matches and ignore
+ const int num_iou_thresholds = iou_thresholds.size();
+ const int num_ground_truth = ground_truth_sorted_indices.size();
+ const int num_detections = detection_sorted_indices.size();
+ std::vector<uint64_t> ground_truth_matches(
+ num_iou_thresholds * num_ground_truth, 0);
+ std::vector<uint64_t>& detection_matches = results->detection_matches;
+ std::vector<bool>& detection_ignores = results->detection_ignores;
+ std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
+ detection_matches.resize(num_iou_thresholds * num_detections, 0);
+ detection_ignores.resize(num_iou_thresholds * num_detections, false);
+ ground_truth_ignores.resize(num_ground_truth);
+ for (auto g = 0; g < num_ground_truth; ++g) {
+ ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
+ }
+
+ for (auto t = 0; t < num_iou_thresholds; ++t) {
+ for (auto d = 0; d < num_detections; ++d) {
+ // information about best match so far (match=-1 -> unmatched)
+ double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
+ int match = -1;
+ for (auto g = 0; g < num_ground_truth; ++g) {
+ // if this ground truth instance is already matched and not a
+ // crowd, it cannot be matched to another detection
+ if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
+ !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
+ continue;
+ }
+
+ // if detected instance matched to a regular ground truth
+ // instance, we can break on the first ground truth instance
+ // tagged as ignore (because they are sorted by the ignore tag)
+ if (match >= 0 && !ground_truth_ignores[match] &&
+ ground_truth_ignores[g]) {
+ break;
+ }
+
+ // if IOU overlap is the best so far, store the match appropriately
+ if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
+ best_iou = ious[d][ground_truth_sorted_indices[g]];
+ match = g;
+ }
+ }
+ // if match was made, store id of match for both detection and
+ // ground truth
+ if (match >= 0) {
+ detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
+ detection_matches[t * num_detections + d] =
+ ground_truth_instances[ground_truth_sorted_indices[match]].id;
+ ground_truth_matches[t * num_ground_truth + match] =
+ detection_instances[detection_sorted_indices[d]].id;
+ }
+
+ // set unmatched detections outside of area range to ignore
+ const InstanceAnnotation& detection =
+ detection_instances[detection_sorted_indices[d]];
+ detection_ignores[t * num_detections + d] =
+ detection_ignores[t * num_detections + d] ||
+ (detection_matches[t * num_detections + d] == 0 &&
+ (detection.area < area_range[0] || detection.area > area_range[1]));
+ }
+ }
+
+ // store detection score results
+ results->detection_scores.resize(detection_sorted_indices.size());
+ for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
+ results->detection_scores[d] =
+ detection_instances[detection_sorted_indices[d]].score;
+ }
+}
+
+std::vector<ImageEvaluation> EvaluateImages(
+ const std::vector<std::array<double, 2>>& area_ranges,
+ int max_detections,
+ const std::vector<double>& iou_thresholds,
+ const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+ const ImageCategoryInstances<InstanceAnnotation>&
+ image_category_ground_truth_instances,
+ const ImageCategoryInstances<InstanceAnnotation>&
+ image_category_detection_instances) {
+ const int num_area_ranges = area_ranges.size();
+ const int num_images = image_category_ground_truth_instances.size();
+ const int num_categories =
+ image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
+ std::vector<uint64_t> detection_sorted_indices;
+ std::vector<uint64_t> ground_truth_sorted_indices;
+ std::vector<bool> ignores;
+ std::vector<ImageEvaluation> results_all(
+ num_images * num_area_ranges * num_categories);
+
+ // Store results for each image, category, and area range combination. Results
+ // for each IOU threshold are packed into the same ImageEvaluation object
+ for (auto i = 0; i < num_images; ++i) {
+ for (auto c = 0; c < num_categories; ++c) {
+ const std::vector<InstanceAnnotation>& ground_truth_instances =
+ image_category_ground_truth_instances[i][c];
+ const std::vector<InstanceAnnotation>& detection_instances =
+ image_category_detection_instances[i][c];
+
+ SortInstancesByDetectionScore(
+ detection_instances, &detection_sorted_indices);
+ if ((int)detection_sorted_indices.size() > max_detections) {
+ detection_sorted_indices.resize(max_detections);
+ }
+
+ for (size_t a = 0; a < area_ranges.size(); ++a) {
+ SortInstancesByIgnore(
+ area_ranges[a],
+ ground_truth_instances,
+ &ground_truth_sorted_indices,
+ &ignores);
+
+ MatchDetectionsToGroundTruth(
+ detection_instances,
+ detection_sorted_indices,
+ ground_truth_instances,
+ ground_truth_sorted_indices,
+ ignores,
+ image_category_ious[i][c],
+ iou_thresholds,
+ area_ranges[a],
+ &results_all
+ [c * num_area_ranges * num_images + a * num_images + i]);
+ }
+ }
+ }
+
+ return results_all;
+}
+
+// Convert a python list to a vector
+template <typename T>
+std::vector<T> list_to_vec(const py::list& l) {
+ std::vector<T> v(py::len(l));
+ for (int i = 0; i < (int)py::len(l); ++i) {
+ v[i] = l[i].cast<T>();
+ }
+ return v;
+}
+
+// Helper function to Accumulate()
+// Considers the evaluation results applicable to a particular category, area
+// range, and max_detections parameter setting, which begin at
+// evaluations[evaluation_index]. Extracts a sorted list of length n of all
+// applicable detection instances concatenated across all images in the dataset,
+// which are represented by the outputs evaluation_indices, detection_scores,
+// image_detection_indices, and detection_sorted_indices--all of which are
+// length n. evaluation_indices[i] stores the applicable index into
+// evaluations[] for instance i, which has detection score detection_score[i],
+// and is the image_detection_indices[i]'th of the list of detections
+// for the image containing i. detection_sorted_indices[] defines a sorted
+// permutation of the 3 other outputs
+int BuildSortedDetectionList(
+ const std::vector<ImageEvaluation>& evaluations,
+ const int64_t evaluation_index,
+ const int64_t num_images,
+ const int max_detections,
+ std::vector<uint64_t>* evaluation_indices,
+ std::vector<double>* detection_scores,
+ std::vector<uint64_t>* detection_sorted_indices,
+ std::vector<uint64_t>* image_detection_indices) {
+ assert(evaluations.size() >= evaluation_index + num_images);
+
+ // Extract a list of object instances of the applicable category, area
+ // range, and max detections requirements such that they can be sorted
+ image_detection_indices->clear();
+ evaluation_indices->clear();
+ detection_scores->clear();
+ image_detection_indices->reserve(num_images * max_detections);
+ evaluation_indices->reserve(num_images * max_detections);
+ detection_scores->reserve(num_images * max_detections);
+ int num_valid_ground_truth = 0;
+ for (auto i = 0; i < num_images; ++i) {
+ const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
+
+ for (int d = 0;
+ d < (int)evaluation.detection_scores.size() && d < max_detections;
+ ++d) { // detected instances
+ evaluation_indices->push_back(evaluation_index + i);
+ image_detection_indices->push_back(d);
+ detection_scores->push_back(evaluation.detection_scores[d]);
+ }
+ for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
+ if (!ground_truth_ignore) {
+ ++num_valid_ground_truth;
+ }
+ }
+ }
+
+ // Sort detections by decreasing score, using stable sort to match
+ // python implementation
+ detection_sorted_indices->resize(detection_scores->size());
+ std::iota(
+ detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
+ std::stable_sort(
+ detection_sorted_indices->begin(),
+ detection_sorted_indices->end(),
+ [&detection_scores](size_t j1, size_t j2) {
+ return (*detection_scores)[j1] > (*detection_scores)[j2];
+ });
+
+ return num_valid_ground_truth;
+}
+
+// Helper function to Accumulate()
+// Compute a precision recall curve given a sorted list of detected instances
+// encoded in evaluations, evaluation_indices, detection_scores,
+// detection_sorted_indices, image_detection_indices (see
+// BuildSortedDetectionList()). Using vectors precisions and recalls
+// and temporary storage, output the results into precisions_out, recalls_out,
+// and scores_out, which are large buffers containing many precision/recall curves
+// for all possible parameter settings, with precisions_out_index and
+// recalls_out_index defining the applicable indices to store results.
+void ComputePrecisionRecallCurve(
+ const int64_t precisions_out_index,
+ const int64_t precisions_out_stride,
+ const int64_t recalls_out_index,
+ const std::vector<double>& recall_thresholds,
+ const int iou_threshold_index,
+ const int num_iou_thresholds,
+ const int num_valid_ground_truth,
+ const std::vector<ImageEvaluation>& evaluations,
+ const std::vector<uint64_t>& evaluation_indices,
+ const std::vector<double>& detection_scores,
+ const std::vector<uint64_t>& detection_sorted_indices,
+ const std::vector<uint64_t>& image_detection_indices,
+ std::vector<double>* precisions,
+ std::vector<double>* recalls,
+ std::vector<double>* precisions_out,
+ std::vector<double>* scores_out,
+ std::vector<double>* recalls_out) {
+ assert(recalls_out->size() > recalls_out_index);
+
+ // Compute precision/recall for each instance in the sorted list of detections
+ int64_t true_positives_sum = 0, false_positives_sum = 0;
+ precisions->clear();
+ recalls->clear();
+ precisions->reserve(detection_sorted_indices.size());
+ recalls->reserve(detection_sorted_indices.size());
+ assert(!evaluations.empty() || detection_sorted_indices.empty());
+ for (auto detection_sorted_index : detection_sorted_indices) {
+ const ImageEvaluation& evaluation =
+ evaluations[evaluation_indices[detection_sorted_index]];
+ const auto num_detections =
+ evaluation.detection_matches.size() / num_iou_thresholds;
+ const auto detection_index = iou_threshold_index * num_detections +
+ image_detection_indices[detection_sorted_index];
+ assert(evaluation.detection_matches.size() > detection_index);
+ assert(evaluation.detection_ignores.size() > detection_index);
+ const int64_t detection_match =
+ evaluation.detection_matches[detection_index];
+ const bool detection_ignores =
+ evaluation.detection_ignores[detection_index];
+ const auto true_positive = detection_match > 0 && !detection_ignores;
+ const auto false_positive = detection_match == 0 && !detection_ignores;
+ if (true_positive) {
+ ++true_positives_sum;
+ }
+ if (false_positive) {
+ ++false_positives_sum;
+ }
+
+ const double recall =
+ static_cast<double>(true_positives_sum) / num_valid_ground_truth;
+ recalls->push_back(recall);
+ const int64_t num_valid_detections =
+ true_positives_sum + false_positives_sum;
+ const double precision = num_valid_detections > 0
+ ? static_cast<double>(true_positives_sum) / num_valid_detections
+ : 0.0;
+ precisions->push_back(precision);
+ }
+
+ (*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
+
+ for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
+ if ((*precisions)[i] > (*precisions)[i - 1]) {
+ (*precisions)[i - 1] = (*precisions)[i];
+ }
+ }
+
+ // Sample the per instance precision/recall list at each recall threshold
+ for (size_t r = 0; r < recall_thresholds.size(); ++r) {
+ // first index in recalls >= recall_thresholds[r]
+ std::vector<double>::iterator low = std::lower_bound(
+ recalls->begin(), recalls->end(), recall_thresholds[r]);
+ size_t precisions_index = low - recalls->begin();
+
+ const auto results_ind = precisions_out_index + r * precisions_out_stride;
+ assert(results_ind < precisions_out->size());
+ assert(results_ind < scores_out->size());
+ if (precisions_index < precisions->size()) {
+ (*precisions_out)[results_ind] = (*precisions)[precisions_index];
+ (*scores_out)[results_ind] =
+ detection_scores[detection_sorted_indices[precisions_index]];
+ } else {
+ (*precisions_out)[results_ind] = 0;
+ (*scores_out)[results_ind] = 0;
+ }
+ }
+}
+py::dict Accumulate(
+ const py::object& params,
+ const std::vector<ImageEvaluation>& evaluations) {
+ const std::vector<double> recall_thresholds =
+ list_to_vec<double>(params.attr("recThrs"));
+ const std::vector<int> max_detections =
+ list_to_vec<int>(params.attr("maxDets"));
+ const int num_iou_thresholds = py::len(params.attr("iouThrs"));
+ const int num_recall_thresholds = py::len(params.attr("recThrs"));
+ const int num_categories = params.attr("useCats").cast<int>() == 1
+ ? py::len(params.attr("catIds"))
+ : 1;
+ const int num_area_ranges = py::len(params.attr("areaRng"));
+ const int num_max_detections = py::len(params.attr("maxDets"));
+ const int num_images = py::len(params.attr("imgIds"));
+
+ std::vector<double> precisions_out(
+ num_iou_thresholds * num_recall_thresholds * num_categories *
+ num_area_ranges * num_max_detections,
+ -1);
+ std::vector<double> recalls_out(
+ num_iou_thresholds * num_categories * num_area_ranges *
+ num_max_detections,
+ -1);
+ std::vector<double> scores_out(
+ num_iou_thresholds * num_recall_thresholds * num_categories *
+ num_area_ranges * num_max_detections,
+ -1);
+
+ // Consider the list of all detected instances in the entire dataset in one
+ // large list. evaluation_indices, detection_scores,
+ // image_detection_indices, and detection_sorted_indices all have the same
+ // length as this list, such that each entry corresponds to one detected
+ // instance
+ std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
+ std::vector<double> detection_scores; // detection scores of each instance
+ std::vector<uint64_t> detection_sorted_indices; // sorted indices of all
+ // instances in the dataset
+ std::vector<uint64_t>
+ image_detection_indices; // indices into the list of detected instances in
+ // the same image as each instance
+ std::vector<double> precisions, recalls;
+
+ for (auto c = 0; c < num_categories; ++c) {
+ for (auto a = 0; a < num_area_ranges; ++a) {
+ for (auto m = 0; m < num_max_detections; ++m) {
+ // The COCO PythonAPI assumes evaluations[] (the return value of
+ // COCOeval::EvaluateImages() is one long list storing results for each
+ // combination of category, area range, and image id, with categories in
+ // the outermost loop and images in the innermost loop.
+ const int64_t evaluations_index =
+ c * num_area_ranges * num_images + a * num_images;
+ int num_valid_ground_truth = BuildSortedDetectionList(
+ evaluations,
+ evaluations_index,
+ num_images,
+ max_detections[m],
+ &evaluation_indices,
+ &detection_scores,
+ &detection_sorted_indices,
+ &image_detection_indices);
+
+ if (num_valid_ground_truth == 0) {
+ continue;
+ }
+
+ for (auto t = 0; t < num_iou_thresholds; ++t) {
+ // recalls_out is a flattened vectors representing a
+ // num_iou_thresholds X num_categories X num_area_ranges X
+ // num_max_detections matrix
+ const int64_t recalls_out_index =
+ t * num_categories * num_area_ranges * num_max_detections +
+ c * num_area_ranges * num_max_detections +
+ a * num_max_detections + m;
+
+ // precisions_out and scores_out are flattened vectors
+ // representing a num_iou_thresholds X num_recall_thresholds X
+ // num_categories X num_area_ranges X num_max_detections matrix
+ const int64_t precisions_out_stride =
+ num_categories * num_area_ranges * num_max_detections;
+ const int64_t precisions_out_index = t * num_recall_thresholds *
+ num_categories * num_area_ranges * num_max_detections +
+ c * num_area_ranges * num_max_detections +
+ a * num_max_detections + m;
+
+ ComputePrecisionRecallCurve(
+ precisions_out_index,
+ precisions_out_stride,
+ recalls_out_index,
+ recall_thresholds,
+ t,
+ num_iou_thresholds,
+ num_valid_ground_truth,
+ evaluations,
+ evaluation_indices,
+ detection_scores,
+ detection_sorted_indices,
+ image_detection_indices,
+ &precisions,
+ &recalls,
+ &precisions_out,
+ &scores_out,
+ &recalls_out);
+ }
+ }
+ }
+ }
+
+ time_t rawtime;
+ struct tm local_time;
+ std::array<char, 200> buffer;
+ time(&rawtime);
+#ifdef _WIN32
+ localtime_s(&local_time, &rawtime);
+#else
+ localtime_r(&rawtime, &local_time);
+#endif
+ strftime(
+ buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);
+ return py::dict(
+ "params"_a = params,
+ "counts"_a = std::vector({num_iou_thresholds,
+ num_recall_thresholds,
+ num_categories,
+ num_area_ranges,
+ num_max_detections}),
+ "date"_a = buffer,
+ "precision"_a = precisions_out,
+ "recall"_a = recalls_out,
+ "scores"_a = scores_out);
+}
+
+} // namespace COCOeval
diff --git a/what/models/detection/yolox/layers/cocoeval/cocoeval.h b/what/models/detection/yolox/layers/cocoeval/cocoeval.h
new file mode 100644
index 0000000..dbf5aab
--- /dev/null
+++ b/what/models/detection/yolox/layers/cocoeval/cocoeval.h
@@ -0,0 +1,98 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+#include <vector>
+
+namespace py = pybind11;
+
+namespace COCOeval {
+
+// Annotation data for a single object instance in an image
+struct InstanceAnnotation {
+ InstanceAnnotation(
+ uint64_t id,
+ double score,
+ double area,
+ bool is_crowd,
+ bool ignore)
+ : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
+ uint64_t id;
+ double score = 0.;
+ double area = 0.;
+ bool is_crowd = false;
+ bool ignore = false;
+};
+
+// Stores intermediate results for evaluating detection results for a single
+// image that has D detected instances and G ground truth instances. This stores
+// matches between detected and ground truth instances
+struct ImageEvaluation {
+ // For each of the D detected instances, the id of the matched ground truth
+ // instance, or 0 if unmatched
+ std::vector<uint64_t> detection_matches;
+
+ // The detection score of each of the D detected instances
+ std::vector<double> detection_scores;
+
+ // Marks whether or not each of G instances was ignored from evaluation (e.g.,
+ // because it's outside area_range)
+ std::vector<bool> ground_truth_ignores;
+
+ // Marks whether or not each of D instances was ignored from evaluation (e.g.,
+ // because it's outside aRng)
+ std::vector<bool> detection_ignores;
+};
+
+template <class T>
+using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
+
+// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each
+// combination of image, category, area range settings, and IOU thresholds to
+// evaluate, it matches detected instances to ground truth instances and stores
+// the results into a vector of ImageEvaluation results, which will be
+// interpreted by the COCOeval::Accumulate() function to produce precision-recall
+// curves. The parameters of nested vectors have the following semantics:
+// image_category_ious[i][c][d][g] is the intersection over union of the d'th
+// detected instance and g'th ground truth instance of
+// category category_ids[c] in image image_ids[i]
+// image_category_ground_truth_instances[i][c] is a vector of ground truth
+// instances in image image_ids[i] of category category_ids[c]
+// image_category_detection_instances[i][c] is a vector of detected
+// instances in image image_ids[i] of category category_ids[c]
+std::vector<ImageEvaluation> EvaluateImages(
+ const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
+ int max_detections,
+ const std::vector<double>& iou_thresholds,
+ const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+ const ImageCategoryInstances<InstanceAnnotation>&
+ image_category_ground_truth_instances,
+ const ImageCategoryInstances<InstanceAnnotation>&
+ image_category_detection_instances);
+
+// C++ implementation of COCOeval.accumulate(), which generates precision
+// recall curves for each set of category, IOU threshold, detection area range,
+// and max number of detections parameters. It is assumed that the parameter
+// evaluations is the return value of the function COCOeval::EvaluateImages(),
+// which was called with the same parameter settings params
+py::dict Accumulate(
+ const py::object& params,
+ const std::vector<ImageEvaluation>& evaluations);
+
+} // namespace COCOeval
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+ m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
+ m.def(
+ "COCOevalEvaluateImages",
+ &COCOeval::EvaluateImages,
+ "COCOeval::EvaluateImages");
+ pybind11::class_(m, "InstanceAnnotation")
+ .def(pybind11::init());
+ pybind11::class_(m, "ImageEvaluation")
+ .def(pybind11::init<>());
+}
diff --git a/what/models/detection/yolox/layers/fast_coco_eval_api.py b/what/models/detection/yolox/layers/fast_coco_eval_api.py
new file mode 100644
index 0000000..5f3aeb5
--- /dev/null
+++ b/what/models/detection/yolox/layers/fast_coco_eval_api.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# This file comes from
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import copy
+import time
+
+import numpy as np
+from pycocotools.cocoeval import COCOeval
+
+from .jit_ops import FastCOCOEvalOp
+
+
+class COCOeval_opt(COCOeval):
+ """
+ This is a slightly modified version of the original COCO API, where the functions evaluateImg()
+ and accumulate() are implemented in C++ to speed up evaluation
+ """
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.module = FastCOCOEvalOp().load()
+
+ def evaluate(self):
+ """
+ Run per image evaluation on given images and store results in self.evalImgs_cpp, a
+ data structure that isn't readable from Python but is used by a C++ implementation of
+ accumulate(). Unlike the original COCO PythonAPI, we don't populate the data structure
+ self.evalImgs because it is a computational bottleneck.
+ :return: None
+ """
+ tic = time.time()
+
+ print("Running per image evaluation...")
+ p = self.params
+ # add backward compatibility if useSegm is specified in params
+ if p.useSegm is not None:
+ p.iouType = "segm" if p.useSegm == 1 else "bbox"
+ print(
+ "useSegm (deprecated) is not None. Running {} evaluation".format(
+ p.iouType
+ )
+ )
+ print("Evaluate annotation type *{}*".format(p.iouType))
+ p.imgIds = list(np.unique(p.imgIds))
+ if p.useCats:
+ p.catIds = list(np.unique(p.catIds))
+ p.maxDets = sorted(p.maxDets)
+ self.params = p
+
+ self._prepare()
+
+ # loop through images, area range, max detection number
+ catIds = p.catIds if p.useCats else [-1]
+
+ if p.iouType == "segm" or p.iouType == "bbox":
+ computeIoU = self.computeIoU
+ elif p.iouType == "keypoints":
+ computeIoU = self.computeOks
+ self.ious = {
+ (imgId, catId): computeIoU(imgId, catId)
+ for imgId in p.imgIds
+ for catId in catIds
+ }
+
+ maxDet = p.maxDets[-1]
+
+ # <<<< Beginning of code differences with original COCO API
+ def convert_instances_to_cpp(instances, is_det=False):
+ # Convert annotations for a list of instances in an image to a format that's fast
+ # to access in C++
+ instances_cpp = []
+ for instance in instances:
+ instance_cpp = self.module.InstanceAnnotation(
+ int(instance["id"]),
+ instance["score"] if is_det else instance.get("score", 0.0),
+ instance["area"],
+ bool(instance.get("iscrowd", 0)),
+ bool(instance.get("ignore", 0)),
+ )
+ instances_cpp.append(instance_cpp)
+ return instances_cpp
+
+ # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
+ ground_truth_instances = [
+ [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
+ for imgId in p.imgIds
+ ]
+ detected_instances = [
+ [
+ convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
+ for catId in p.catIds
+ ]
+ for imgId in p.imgIds
+ ]
+ ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
+
+ if not p.useCats:
+ # For each image, flatten per-category lists into a single list
+ ground_truth_instances = [
+ [[o for c in i for o in c]] for i in ground_truth_instances
+ ]
+ detected_instances = [
+ [[o for c in i for o in c]] for i in detected_instances
+ ]
+
+ # Call C++ implementation of self.evaluateImgs()
+ self._evalImgs_cpp = self.module.COCOevalEvaluateImages(
+ p.areaRng,
+ maxDet,
+ p.iouThrs,
+ ious,
+ ground_truth_instances,
+ detected_instances,
+ )
+ self._evalImgs = None
+
+ self._paramsEval = copy.deepcopy(self.params)
+ toc = time.time()
+ print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
+ # >>>> End of code differences with original COCO API
+
+ def accumulate(self):
+ """
+ Accumulate per image evaluation results and store the result in self.eval. Does not
+ support changing parameter settings from those used by self.evaluate()
+ """
+ print("Accumulating evaluation results...")
+ tic = time.time()
+ if not hasattr(self, "_evalImgs_cpp"):
+ print("Please run evaluate() first")
+
+ self.eval = self.module.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
+
+ # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
+ self.eval["recall"] = np.array(self.eval["recall"]).reshape(
+ self.eval["counts"][:1] + self.eval["counts"][2:]
+ )
+
+ # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
+ # num_area_ranges X num_max_detections
+ self.eval["precision"] = np.array(self.eval["precision"]).reshape(
+ self.eval["counts"]
+ )
+ self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
+ toc = time.time()
+ print(
+ "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)
+ )
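+
+# Example usage (illustrative): COCOeval_opt is a drop-in replacement for
+# pycocotools.cocoeval.COCOeval; coco_gt and coco_dt below are placeholder
+# COCO ground-truth and detection objects, not names defined in this module.
+#
+#   evaluator = COCOeval_opt(coco_gt, coco_dt, iouType="bbox")
+#   evaluator.evaluate()
+#   evaluator.accumulate()
+#   evaluator.summarize()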
diff --git a/what/models/detection/yolox/layers/jit_ops.py b/what/models/detection/yolox/layers/jit_ops.py
new file mode 100644
index 0000000..0fdac4d
--- /dev/null
+++ b/what/models/detection/yolox/layers/jit_ops.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii, Inc. and its affiliates. All Rights Reserved
+
+import glob
+import importlib
+import os
+import sys
+import time
+from typing import List
+
+__all__ = ["JitOp", "FastCOCOEvalOp"]
+
+
+class JitOp:
+ """
+ Just-in-time compilation of ops.
+
+ Some code of `JitOp` is inspired by `deepspeed.op_builder`,
+ check the following link for more details:
+ https://github.com/microsoft/DeepSpeed/blob/master/op_builder/builder.py
+ """
+
+ def __init__(self, name):
+ self.name = name
+
+ def absolute_name(self) -> str:
+ """Get absolute build path for cases where the op is pre-installed."""
+ pass
+
+ def sources(self) -> List:
+ """Get path list of source files of op.
+
+ NOTE: the path should be relative to the root of the package during building;
+ otherwise, an exception will be raised when building the package.
+ However, for runtime building, the path will be absolute.
+ """
+ pass
+
+ def include_dirs(self) -> List:
+ """
+ Get list of include paths, relative to root of package.
+
+ NOTE: the path should be relative to the root of the package.
+ Otherwise, an exception will be raised when building the package.
+ """
+ return []
+
+ def define_macros(self) -> List:
+ """Get list of macros to define for op"""
+ return []
+
+ def cxx_args(self) -> List:
+ """Get optional list of compiler flags to forward"""
+ args = ["-O2"] if sys.platform == "win32" else ["-O3", "-std=c++14", "-g", "-Wno-reorder"]
+ return args
+
+ def nvcc_args(self) -> List:
+ """Get optional list of compiler flags to forward to nvcc when building CUDA sources"""
+ args = [
+ "-O3", "--use_fast_math",
+ "-std=c++17" if sys.platform == "win32" else "-std=c++14",
+ "-U__CUDA_NO_HALF_OPERATORS__",
+ "-U__CUDA_NO_HALF_CONVERSIONS__",
+ "-U__CUDA_NO_HALF2_OPERATORS__",
+ ]
+ return args
+
+ def build_op(self):
+ from torch.utils.cpp_extension import CppExtension
+ return CppExtension(
+ name=self.absolute_name(),
+ sources=self.sources(),
+ include_dirs=self.include_dirs(),
+ define_macros=self.define_macros(),
+ extra_compile_args={
+ "cxx": self.cxx_args(),
+ },
+ )
+
+ def load(self, verbose=True):
+ try:
+ # try to import op from pre-installed package
+ return importlib.import_module(self.absolute_name())
+ except Exception: # op not compiled, jit load
+ from yolox.utils import wait_for_the_master
+ with wait_for_the_master(): # to avoid race condition
+ return self.jit_load(verbose)
+
+ def jit_load(self, verbose=True):
+ from torch.utils.cpp_extension import load
+ from loguru import logger
+ try:
+ import ninja # noqa
+ except ImportError:
+ if verbose:
+ logger.warning(
+ f"Ninja is not installed, fall back to normal installation for {self.name}."
+ )
+
+ build_tik = time.time()
+ # build op and load
+ op_module = load(
+ name=self.name,
+ sources=self.sources(),
+ extra_cflags=self.cxx_args(),
+ extra_cuda_cflags=self.nvcc_args(),
+ verbose=verbose,
+ )
+ build_duration = time.time() - build_tik
+ if verbose:
+ logger.info(f"Load {self.name} op in {build_duration:.3f}s.")
+ return op_module
+
+ def clear_dynamic_library(self):
+ """Remove dynamic libraray files generated by JIT compilation."""
+ module = self.load()
+ os.remove(module.__file__)
+
+
+class FastCOCOEvalOp(JitOp):
+
+ def __init__(self, name="fast_cocoeval"):
+ super().__init__(name=name)
+
+ def absolute_name(self):
+ return f'yolox.layers.{self.name}'
+
+ def sources(self):
+ sources = glob.glob(os.path.join("yolox", "layers", "cocoeval", "*.cpp"))
+ if not sources: # sources will be an empty list if the .so file is removed after install
+ # use absolute path to compile
+ import yolox
+ code_path = os.path.join(yolox.__path__[0], "layers", "cocoeval", "*.cpp")
+ sources = glob.glob(code_path)
+ return sources
+
+ def include_dirs(self):
+ return [os.path.join("yolox", "layers", "cocoeval")]
diff --git a/what/models/detection/yolox/models/__init__.py b/what/models/detection/yolox/models/__init__.py
new file mode 100644
index 0000000..c74fd30
--- /dev/null
+++ b/what/models/detection/yolox/models/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+from .build import *
+from .darknet import CSPDarknet, Darknet
+from .losses import IOUloss
+from .yolo_fpn import YOLOFPN
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+from .yolox import YOLOX
diff --git a/what/models/detection/yolox/models/build.py b/what/models/detection/yolox/models/build.py
new file mode 100644
index 0000000..8edc87d
--- /dev/null
+++ b/what/models/detection/yolox/models/build.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import torch
+from torch import nn
+from torch.hub import load_state_dict_from_url
+
+__all__ = [
+ "create_yolox_model",
+ "yolox_nano",
+ "yolox_tiny",
+ "yolox_s",
+ "yolox_m",
+ "yolox_l",
+ "yolox_x",
+ "yolov3",
+ "yolox_custom"
+]
+
+_CKPT_ROOT_URL = "https://github.com/Megvii-BaseDetection/YOLOX/releases/download"
+_CKPT_FULL_PATH = {
+ "yolox-nano": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_nano.pth",
+ "yolox-tiny": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_tiny.pth",
+ "yolox-s": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_s.pth",
+ "yolox-m": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_m.pth",
+ "yolox-l": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_l.pth",
+ "yolox-x": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_x.pth",
+ "yolov3": f"{_CKPT_ROOT_URL}/0.1.1rc0/yolox_darknet.pth",
+}
+
+
+def create_yolox_model(name: str, pretrained: bool = True, num_classes: int = 80, device=None,
+ exp_path: str = None, ckpt_path: str = None) -> nn.Module:
+ """creates and loads a YOLOX model
+
+ Args:
+ name (str): name of the model, for example "yolox-s", "yolox-tiny", or "yolox_custom"
+ if you want to load your own model.
+ pretrained (bool): load pretrained weights into the model. Defaults to True.
+ device (str): device to place the model on. Defaults to None.
+ num_classes (int): number of model classes. Defaults to 80.
+ exp_path (str): path to your own experiment file. Required if name="yolox_custom".
+ ckpt_path (str): path to your own checkpoint. Required if name="yolox_custom" and you
+ want to load a pretrained model.
+
+
+ Returns:
+ YOLOX model (nn.Module)
+ """
+ from yolox.exp import get_exp, Exp
+
+ if device is None:
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ device = torch.device(device)
+
+ assert name in _CKPT_FULL_PATH or name == "yolox_custom", \
+ f"user should use one of value in {_CKPT_FULL_PATH.keys()} or \"yolox_custom\""
+ if name in _CKPT_FULL_PATH:
+ exp: Exp = get_exp(exp_name=name)
+ exp.num_classes = num_classes
+ yolox_model = exp.get_model()
+ if pretrained and num_classes == 80:
+ weights_url = _CKPT_FULL_PATH[name]
+ ckpt = load_state_dict_from_url(weights_url, map_location="cpu")
+ if "model" in ckpt:
+ ckpt = ckpt["model"]
+ yolox_model.load_state_dict(ckpt)
+ else:
+ assert exp_path is not None, "for a \"yolox_custom\" model exp_path must be provided"
+ exp: Exp = get_exp(exp_file=exp_path)
+ yolox_model = exp.get_model()
+ if ckpt_path:
+ ckpt = torch.load(ckpt_path, map_location="cpu")
+ if "model" in ckpt:
+ ckpt = ckpt["model"]
+ yolox_model.load_state_dict(ckpt)
+
+ yolox_model.to(device)
+ return yolox_model
+
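+# Example usage (illustrative): the paths below are placeholders, not files
+# shipped with this package.
+#
+#   model = create_yolox_model("yolox-s", pretrained=True)
+#   custom = create_yolox_model("yolox_custom", exp_path="exps/my_exp.py",
+#                               ckpt_path="ckpts/my_ckpt.pth")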
+
+def yolox_nano(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox-nano", pretrained, num_classes, device)
+
+
+def yolox_tiny(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox-tiny", pretrained, num_classes, device)
+
+
+def yolox_s(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox-s", pretrained, num_classes, device)
+
+
+def yolox_m(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox-m", pretrained, num_classes, device)
+
+
+def yolox_l(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox-l", pretrained, num_classes, device)
+
+
+def yolox_x(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox-x", pretrained, num_classes, device)
+
+
+def yolov3(pretrained: bool = True, num_classes: int = 80, device: str = None) -> nn.Module:
+ return create_yolox_model("yolov3", pretrained, num_classes, device)
+
+
+def yolox_custom(ckpt_path: str = None, exp_path: str = None, device: str = None) -> nn.Module:
+ return create_yolox_model("yolox_custom", ckpt_path=ckpt_path, exp_path=exp_path, device=device)
diff --git a/what/models/detection/yolox/models/darknet.py b/what/models/detection/yolox/models/darknet.py
new file mode 100644
index 0000000..b3e053f
--- /dev/null
+++ b/what/models/detection/yolox/models/darknet.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+from torch import nn
+
+from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
+
+
+class Darknet(nn.Module):
+ # number of blocks from dark2 to dark5.
+ depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}
+
+ def __init__(
+ self,
+ depth,
+ in_channels=3,
+ stem_out_channels=32,
+ out_features=("dark3", "dark4", "dark5"),
+ ):
+ """
+ Args:
+ depth (int): depth of the darknet used in the model; typically 21 or 53.
+ in_channels (int): number of input channels, for example, use 3 for RGB images.
+ stem_out_channels (int): number of output channels of the darknet stem.
+ It decides the channels of darknet layer2 to layer5.
+ out_features (Tuple[str]): desired output layer names.
+ """
+ super().__init__()
+ assert out_features, "please provide output features of Darknet"
+ self.out_features = out_features
+ self.stem = nn.Sequential(
+ BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
+ *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
+ )
+ in_channels = stem_out_channels * 2 # 64
+
+ num_blocks = Darknet.depth2blocks[depth]
+ # create darknet with `stem_out_channels` and `num_blocks` layers.
+ # to make the model structure clearer, we don't use a `for` statement in python.
+ self.dark2 = nn.Sequential(
+ *self.make_group_layer(in_channels, num_blocks[0], stride=2)
+ )
+ in_channels *= 2 # 128
+ self.dark3 = nn.Sequential(
+ *self.make_group_layer(in_channels, num_blocks[1], stride=2)
+ )
+ in_channels *= 2 # 256
+ self.dark4 = nn.Sequential(
+ *self.make_group_layer(in_channels, num_blocks[2], stride=2)
+ )
+ in_channels *= 2 # 512
+
+ self.dark5 = nn.Sequential(
+ *self.make_group_layer(in_channels, num_blocks[3], stride=2),
+ *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
+ )
+
+ def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
+ "starts with conv layer then has `num_blocks` `ResLayer`"
+ return [
+ BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
+ *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)],
+ ]
+
+ def make_spp_block(self, filters_list, in_filters):
+ m = nn.Sequential(
+ *[
+ BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
+ BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
+ SPPBottleneck(
+ in_channels=filters_list[1],
+ out_channels=filters_list[0],
+ activation="lrelu",
+ ),
+ BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
+ BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
+ ]
+ )
+ return m
+
+ def forward(self, x):
+ outputs = {}
+ x = self.stem(x)
+ outputs["stem"] = x
+ x = self.dark2(x)
+ outputs["dark2"] = x
+ x = self.dark3(x)
+ outputs["dark3"] = x
+ x = self.dark4(x)
+ outputs["dark4"] = x
+ x = self.dark5(x)
+ outputs["dark5"] = x
+ return {k: v for k, v in outputs.items() if k in self.out_features}
+
+
+class CSPDarknet(nn.Module):
+ def __init__(
+ self,
+ dep_mul,
+ wid_mul,
+ out_features=("dark3", "dark4", "dark5"),
+ depthwise=False,
+ act="silu",
+ ):
+ super().__init__()
+ assert out_features, "please provide output features of Darknet"
+ self.out_features = out_features
+ Conv = DWConv if depthwise else BaseConv
+
+ base_channels = int(wid_mul * 64) # 64
+ base_depth = max(round(dep_mul * 3), 1) # 3
+
+ # stem
+ self.stem = Focus(3, base_channels, ksize=3, act=act)
+
+ # dark2
+ self.dark2 = nn.Sequential(
+ Conv(base_channels, base_channels * 2, 3, 2, act=act),
+ CSPLayer(
+ base_channels * 2,
+ base_channels * 2,
+ n=base_depth,
+ depthwise=depthwise,
+ act=act,
+ ),
+ )
+
+ # dark3
+ self.dark3 = nn.Sequential(
+ Conv(base_channels * 2, base_channels * 4, 3, 2, act=act),
+ CSPLayer(
+ base_channels * 4,
+ base_channels * 4,
+ n=base_depth * 3,
+ depthwise=depthwise,
+ act=act,
+ ),
+ )
+
+ # dark4
+ self.dark4 = nn.Sequential(
+ Conv(base_channels * 4, base_channels * 8, 3, 2, act=act),
+ CSPLayer(
+ base_channels * 8,
+ base_channels * 8,
+ n=base_depth * 3,
+ depthwise=depthwise,
+ act=act,
+ ),
+ )
+
+ # dark5
+ self.dark5 = nn.Sequential(
+ Conv(base_channels * 8, base_channels * 16, 3, 2, act=act),
+ SPPBottleneck(base_channels * 16, base_channels * 16, activation=act),
+ CSPLayer(
+ base_channels * 16,
+ base_channels * 16,
+ n=base_depth,
+ shortcut=False,
+ depthwise=depthwise,
+ act=act,
+ ),
+ )
+
+ def forward(self, x):
+ outputs = {}
+ x = self.stem(x)
+ outputs["stem"] = x
+ x = self.dark2(x)
+ outputs["dark2"] = x
+ x = self.dark3(x)
+ outputs["dark3"] = x
+ x = self.dark4(x)
+ outputs["dark4"] = x
+ x = self.dark5(x)
+ outputs["dark5"] = x
+ return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/what/models/detection/yolox/models/losses.py b/what/models/detection/yolox/models/losses.py
new file mode 100644
index 0000000..77b4d8e
--- /dev/null
+++ b/what/models/detection/yolox/models/losses.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import torch
+import torch.nn as nn
+
+
+class IOUloss(nn.Module):
+ def __init__(self, reduction="none", loss_type="iou"):
+ super(IOUloss, self).__init__()
+ self.reduction = reduction
+ self.loss_type = loss_type
+
+ def forward(self, pred, target):
+ assert pred.shape[0] == target.shape[0]
+
+ pred = pred.view(-1, 4)
+ target = target.view(-1, 4)
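+ # Boxes are in (cx, cy, w, h) format; tl/br are the intersection's top-left and bottom-right corners.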
+ tl = torch.max(
+ (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2)
+ )
+ br = torch.min(
+ (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2)
+ )
+
+ area_p = torch.prod(pred[:, 2:], 1)
+ area_g = torch.prod(target[:, 2:], 1)
+
+ en = (tl < br).type(tl.type()).prod(dim=1)
+ area_i = torch.prod(br - tl, 1) * en
+ area_u = area_p + area_g - area_i
+ iou = (area_i) / (area_u + 1e-16)
+
+ if self.loss_type == "iou":
+ loss = 1 - iou ** 2
+ elif self.loss_type == "giou":
+ c_tl = torch.min(
+ (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2)
+ )
+ c_br = torch.max(
+ (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2)
+ )
+ area_c = torch.prod(c_br - c_tl, 1)
+ giou = iou - (area_c - area_u) / area_c.clamp(1e-16)
+ loss = 1 - giou.clamp(min=-1.0, max=1.0)
+
+ if self.reduction == "mean":
+ loss = loss.mean()
+ elif self.reduction == "sum":
+ loss = loss.sum()
+
+ return loss
diff --git a/what/models/detection/yolox/models/network_blocks.py b/what/models/detection/yolox/models/network_blocks.py
new file mode 100644
index 0000000..68aacfc
--- /dev/null
+++ b/what/models/detection/yolox/models/network_blocks.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import torch
+import torch.nn as nn
+
+
+class SiLU(nn.Module):
+ """export-friendly version of nn.SiLU()"""
+
+ @staticmethod
+ def forward(x):
+ return x * torch.sigmoid(x)
+
+
+def get_activation(name="silu", inplace=True):
+ if name == "silu":
+ module = nn.SiLU(inplace=inplace)
+ elif name == "relu":
+ module = nn.ReLU(inplace=inplace)
+ elif name == "lrelu":
+ module = nn.LeakyReLU(0.1, inplace=inplace)
+ else:
+ raise AttributeError("Unsupported act type: {}".format(name))
+ return module
+
+
+class BaseConv(nn.Module):
+ """A Conv2d -> Batchnorm -> silu/leaky relu block"""
+
+ def __init__(
+ self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"
+ ):
+ super().__init__()
+ # same padding
+ pad = (ksize - 1) // 2
+ self.conv = nn.Conv2d(
+ in_channels,
+ out_channels,
+ kernel_size=ksize,
+ stride=stride,
+ padding=pad,
+ groups=groups,
+ bias=bias,
+ )
+ self.bn = nn.BatchNorm2d(out_channels)
+ self.act = get_activation(act, inplace=True)
+
+ def forward(self, x):
+ return self.act(self.bn(self.conv(x)))
+
+ def fuseforward(self, x):
+ return self.act(self.conv(x))
+
+
+class DWConv(nn.Module):
+ """Depthwise Conv + Conv"""
+
+ def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
+ super().__init__()
+ self.dconv = BaseConv(
+ in_channels,
+ in_channels,
+ ksize=ksize,
+ stride=stride,
+ groups=in_channels,
+ act=act,
+ )
+ self.pconv = BaseConv(
+ in_channels, out_channels, ksize=1, stride=1, groups=1, act=act
+ )
+
+ def forward(self, x):
+ x = self.dconv(x)
+ return self.pconv(x)
+
+
+class Bottleneck(nn.Module):
+ # Standard bottleneck
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ shortcut=True,
+ expansion=0.5,
+ depthwise=False,
+ act="silu",
+ ):
+ super().__init__()
+ hidden_channels = int(out_channels * expansion)
+ Conv = DWConv if depthwise else BaseConv
+ self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+ self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1, act=act)
+ self.use_add = shortcut and in_channels == out_channels
+
+ def forward(self, x):
+ y = self.conv2(self.conv1(x))
+ if self.use_add:
+ y = y + x
+ return y
+
+
+class ResLayer(nn.Module):
+ "Residual layer with `in_channels` inputs."
+
+ def __init__(self, in_channels: int):
+ super().__init__()
+ mid_channels = in_channels // 2
+ self.layer1 = BaseConv(
+ in_channels, mid_channels, ksize=1, stride=1, act="lrelu"
+ )
+ self.layer2 = BaseConv(
+ mid_channels, in_channels, ksize=3, stride=1, act="lrelu"
+ )
+
+ def forward(self, x):
+ out = self.layer2(self.layer1(x))
+ return x + out
+
+
+class SPPBottleneck(nn.Module):
+ """Spatial pyramid pooling layer used in YOLOv3-SPP"""
+
+ def __init__(
+ self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"
+ ):
+ super().__init__()
+ hidden_channels = in_channels // 2
+ self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
+ self.m = nn.ModuleList(
+ [
+ nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
+ for ks in kernel_sizes
+ ]
+ )
+ conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+ self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)
+
+ def forward(self, x):
+ x = self.conv1(x)
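+ # Concatenate the features with max-pooled copies at each kernel size (5/9/13 by default) to aggregate multi-scale context.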
+ x = torch.cat([x] + [m(x) for m in self.m], dim=1)
+ x = self.conv2(x)
+ return x
+
+
+class CSPLayer(nn.Module):
+ """C3 in yolov5, CSP Bottleneck with 3 convolutions"""
+
+ def __init__(
+ self,
+ in_channels,
+ out_channels,
+ n=1,
+ shortcut=True,
+ expansion=0.5,
+ depthwise=False,
+ act="silu",
+ ):
+ """
+ Args:
+ in_channels (int): input channels.
+ out_channels (int): output channels.
+ n (int): number of Bottlenecks. Default value: 1.
+ """
+ # ch_in, ch_out, number, shortcut, groups, expansion
+ super().__init__()
+ hidden_channels = int(out_channels * expansion) # hidden channels
+ self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+ self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=act)
+ self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1, act=act)
+ module_list = [
+ Bottleneck(
+ hidden_channels, hidden_channels, shortcut, 1.0, depthwise, act=act
+ )
+ for _ in range(n)
+ ]
+ self.m = nn.Sequential(*module_list)
+
+ def forward(self, x):
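+ # Two branches: conv1 feeds the stack of bottlenecks, conv2 is a shortcut; the results are concatenated and fused by conv3.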
+ x_1 = self.conv1(x)
+ x_2 = self.conv2(x)
+ x_1 = self.m(x_1)
+ x = torch.cat((x_1, x_2), dim=1)
+ return self.conv3(x)
+
+
+class Focus(nn.Module):
+ """Focus width and height information into channel space."""
+
+ def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
+ super().__init__()
+ self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)
+
+ def forward(self, x):
+ # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2)
+ patch_top_left = x[..., ::2, ::2]
+ patch_top_right = x[..., ::2, 1::2]
+ patch_bot_left = x[..., 1::2, ::2]
+ patch_bot_right = x[..., 1::2, 1::2]
+ x = torch.cat(
+ (
+ patch_top_left,
+ patch_bot_left,
+ patch_top_right,
+ patch_bot_right,
+ ),
+ dim=1,
+ )
+ return self.conv(x)
diff --git a/what/models/detection/yolox/models/yolo_fpn.py b/what/models/detection/yolox/models/yolo_fpn.py
new file mode 100644
index 0000000..224271f
--- /dev/null
+++ b/what/models/detection/yolox/models/yolo_fpn.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import torch
+import torch.nn as nn
+
+from .darknet import Darknet
+from .network_blocks import BaseConv
+
+
+class YOLOFPN(nn.Module):
+ """
+ YOLOFPN module. Darknet 53 is the default backbone of this model.
+ """
+
+ def __init__(
+ self,
+ depth=53,
+ in_features=["dark3", "dark4", "dark5"],
+ ):
+ super().__init__()
+
+ self.backbone = Darknet(depth)
+ self.in_features = in_features
+
+ # out 1
+ self.out1_cbl = self._make_cbl(512, 256, 1)
+ self.out1 = self._make_embedding([256, 512], 512 + 256)
+
+ # out 2
+ self.out2_cbl = self._make_cbl(256, 128, 1)
+ self.out2 = self._make_embedding([128, 256], 256 + 128)
+
+ # upsample
+ self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
+
+ def _make_cbl(self, _in, _out, ks):
+ return BaseConv(_in, _out, ks, stride=1, act="lrelu")
+
+ def _make_embedding(self, filters_list, in_filters):
+ m = nn.Sequential(
+ *[
+ self._make_cbl(in_filters, filters_list[0], 1),
+ self._make_cbl(filters_list[0], filters_list[1], 3),
+ self._make_cbl(filters_list[1], filters_list[0], 1),
+ self._make_cbl(filters_list[0], filters_list[1], 3),
+ self._make_cbl(filters_list[1], filters_list[0], 1),
+ ]
+ )
+ return m
+
+ def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"):
+ with open(filename, "rb") as f:
+ state_dict = torch.load(f, map_location="cpu")
+ print("loading pretrained weights...")
+ self.backbone.load_state_dict(state_dict)
+
+ def forward(self, inputs):
+ """
+ Args:
+ inputs (Tensor): input image.
+
+ Returns:
+ Tuple[Tensor]: FPN output features.
+ """
+ # backbone
+ out_features = self.backbone(inputs)
+ x2, x1, x0 = [out_features[f] for f in self.in_features]
+
+ # yolo branch 1
+ x1_in = self.out1_cbl(x0)
+ x1_in = self.upsample(x1_in)
+ x1_in = torch.cat([x1_in, x1], 1)
+ out_dark4 = self.out1(x1_in)
+
+ # yolo branch 2
+ x2_in = self.out2_cbl(out_dark4)
+ x2_in = self.upsample(x2_in)
+ x2_in = torch.cat([x2_in, x2], 1)
+ out_dark3 = self.out2(x2_in)
+
+ outputs = (out_dark3, out_dark4, x0)
+ return outputs
diff --git a/what/models/detection/yolox/models/yolo_head.py b/what/models/detection/yolox/models/yolo_head.py
new file mode 100644
index 0000000..6247a75
--- /dev/null
+++ b/what/models/detection/yolox/models/yolo_head.py
@@ -0,0 +1,641 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import math
+from loguru import logger
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from what.models.detection.yolox.utils import bboxes_iou, cxcywh2xyxy, meshgrid, visualize_assign
+
+from .losses import IOUloss
+from .network_blocks import BaseConv, DWConv
+
+
+class YOLOXHead(nn.Module):
+ def __init__(
+ self,
+ num_classes,
+ width=1.0,
+ strides=[8, 16, 32],
+ in_channels=[256, 512, 1024],
+ act="silu",
+ depthwise=False,
+ ):
+ """
+ Args:
+ act (str): activation type of conv. Default value: "silu".
+ depthwise (bool): whether to apply depthwise conv in conv branch. Default value: False.
+ """
+ super().__init__()
+
+ self.num_classes = num_classes
+ self.decode_in_inference = True # for deploy, set to False
+
+ self.cls_convs = nn.ModuleList()
+ self.reg_convs = nn.ModuleList()
+ self.cls_preds = nn.ModuleList()
+ self.reg_preds = nn.ModuleList()
+ self.obj_preds = nn.ModuleList()
+ self.stems = nn.ModuleList()
+ Conv = DWConv if depthwise else BaseConv
+
+ for i in range(len(in_channels)):
+ self.stems.append(
+ BaseConv(
+ in_channels=int(in_channels[i] * width),
+ out_channels=int(256 * width),
+ ksize=1,
+ stride=1,
+ act=act,
+ )
+ )
+ self.cls_convs.append(
+ nn.Sequential(
+ *[
+ Conv(
+ in_channels=int(256 * width),
+ out_channels=int(256 * width),
+ ksize=3,
+ stride=1,
+ act=act,
+ ),
+ Conv(
+ in_channels=int(256 * width),
+ out_channels=int(256 * width),
+ ksize=3,
+ stride=1,
+ act=act,
+ ),
+ ]
+ )
+ )
+ self.reg_convs.append(
+ nn.Sequential(
+ *[
+ Conv(
+ in_channels=int(256 * width),
+ out_channels=int(256 * width),
+ ksize=3,
+ stride=1,
+ act=act,
+ ),
+ Conv(
+ in_channels=int(256 * width),
+ out_channels=int(256 * width),
+ ksize=3,
+ stride=1,
+ act=act,
+ ),
+ ]
+ )
+ )
+ self.cls_preds.append(
+ nn.Conv2d(
+ in_channels=int(256 * width),
+ out_channels=self.num_classes,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+ )
+ self.reg_preds.append(
+ nn.Conv2d(
+ in_channels=int(256 * width),
+ out_channels=4,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+ )
+ self.obj_preds.append(
+ nn.Conv2d(
+ in_channels=int(256 * width),
+ out_channels=1,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+ )
+
+ self.use_l1 = False
+ self.l1_loss = nn.L1Loss(reduction="none")
+ self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none")
+ self.iou_loss = IOUloss(reduction="none")
+ self.strides = strides
+ self.grids = [torch.zeros(1)] * len(in_channels)
+
+ def initialize_biases(self, prior_prob):
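+ # Fill the cls/obj prediction biases so that the initial sigmoid output is approximately prior_prob.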
+ for conv in self.cls_preds:
+ b = conv.bias.view(1, -1)
+ b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
+ conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+ for conv in self.obj_preds:
+ b = conv.bias.view(1, -1)
+ b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
+ conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+ def forward(self, xin, labels=None, imgs=None):
+ outputs = []
+ origin_preds = []
+ x_shifts = []
+ y_shifts = []
+ expanded_strides = []
+
+ for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
+ zip(self.cls_convs, self.reg_convs, self.strides, xin)
+ ):
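+ # Iterate over the FPN levels (strides 8, 16, 32 by default), producing per-level cls/reg/obj predictions.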
+ x = self.stems[k](x)
+ cls_x = x
+ reg_x = x
+
+ cls_feat = cls_conv(cls_x)
+ cls_output = self.cls_preds[k](cls_feat)
+
+ reg_feat = reg_conv(reg_x)
+ reg_output = self.reg_preds[k](reg_feat)
+ obj_output = self.obj_preds[k](reg_feat)
+
+ if self.training:
+ output = torch.cat([reg_output, obj_output, cls_output], 1)
+ output, grid = self.get_output_and_grid(
+ output, k, stride_this_level, xin[0].type()
+ )
+ x_shifts.append(grid[:, :, 0])
+ y_shifts.append(grid[:, :, 1])
+ expanded_strides.append(
+ torch.zeros(1, grid.shape[1])
+ .fill_(stride_this_level)
+ .type_as(xin[0])
+ )
+ if self.use_l1:
+ batch_size = reg_output.shape[0]
+ hsize, wsize = reg_output.shape[-2:]
+ reg_output = reg_output.view(
+ batch_size, 1, 4, hsize, wsize
+ )
+ reg_output = reg_output.permute(0, 1, 3, 4, 2).reshape(
+ batch_size, -1, 4
+ )
+ origin_preds.append(reg_output.clone())
+
+ else:
+ output = torch.cat(
+ [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1
+ )
+
+ outputs.append(output)
+
+ if self.training:
+ return self.get_losses(
+ imgs,
+ x_shifts,
+ y_shifts,
+ expanded_strides,
+ labels,
+ torch.cat(outputs, 1),
+ origin_preds,
+ dtype=xin[0].dtype,
+ )
+ else:
+ self.hw = [x.shape[-2:] for x in outputs]
+ # [batch, n_anchors_all, 85]
+ outputs = torch.cat(
+ [x.flatten(start_dim=2) for x in outputs], dim=2
+ ).permute(0, 2, 1)
+ if self.decode_in_inference:
+ return self.decode_outputs(outputs, dtype=xin[0].type())
+ else:
+ return outputs
+
+ def get_output_and_grid(self, output, k, stride, dtype):
+ grid = self.grids[k]
+
+ batch_size = output.shape[0]
+ n_ch = 5 + self.num_classes
+ hsize, wsize = output.shape[-2:]
+ if grid.shape[2:4] != output.shape[2:4]:
+ yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
+ grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
+ self.grids[k] = grid
+
+ output = output.view(batch_size, 1, n_ch, hsize, wsize)
+ output = output.permute(0, 1, 3, 4, 2).reshape(
+ batch_size, hsize * wsize, -1
+ )
+ grid = grid.view(1, -1, 2)
+ output[..., :2] = (output[..., :2] + grid) * stride
+ output[..., 2:4] = torch.exp(output[..., 2:4]) * stride
+ return output, grid
+
+ def decode_outputs(self, outputs, dtype):
+ grids = []
+ strides = []
+ for (hsize, wsize), stride in zip(self.hw, self.strides):
+ yv, xv = meshgrid([torch.arange(hsize), torch.arange(wsize)])
+ grid = torch.stack((xv, yv), 2).view(1, -1, 2)
+ grids.append(grid)
+ shape = grid.shape[:2]
+ strides.append(torch.full((*shape, 1), stride))
+
+ grids = torch.cat(grids, dim=1).type(dtype)
+ strides = torch.cat(strides, dim=1).type(dtype)
+
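+ # Decode: xy offsets are relative to grid cells (scaled by stride), wh are log-encoded multiples of the stride.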
+ outputs = torch.cat([
+ (outputs[..., 0:2] + grids) * strides,
+ torch.exp(outputs[..., 2:4]) * strides,
+ outputs[..., 4:]
+ ], dim=-1)
+ return outputs
+
+ def get_losses(
+ self,
+ imgs,
+ x_shifts,
+ y_shifts,
+ expanded_strides,
+ labels,
+ outputs,
+ origin_preds,
+ dtype,
+ ):
+ bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4]
+ obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1]
+ cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls]
+
+ # calculate targets
+ nlabel = (labels.sum(dim=2) > 0).sum(dim=1) # number of objects
+
+ total_num_anchors = outputs.shape[1]
+ x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all]
+ y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all]
+ expanded_strides = torch.cat(expanded_strides, 1)
+ if self.use_l1:
+ origin_preds = torch.cat(origin_preds, 1)
+
+ cls_targets = []
+ reg_targets = []
+ l1_targets = []
+ obj_targets = []
+ fg_masks = []
+
+ num_fg = 0.0
+ num_gts = 0.0
+
+ for batch_idx in range(outputs.shape[0]):
+ num_gt = int(nlabel[batch_idx])
+ num_gts += num_gt
+ if num_gt == 0:
+ cls_target = outputs.new_zeros((0, self.num_classes))
+ reg_target = outputs.new_zeros((0, 4))
+ l1_target = outputs.new_zeros((0, 4))
+ obj_target = outputs.new_zeros((total_num_anchors, 1))
+ fg_mask = outputs.new_zeros(total_num_anchors).bool()
+ else:
+ gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5]
+ gt_classes = labels[batch_idx, :num_gt, 0]
+ bboxes_preds_per_image = bbox_preds[batch_idx]
+
+ try:
+ (
+ gt_matched_classes,
+ fg_mask,
+ pred_ious_this_matching,
+ matched_gt_inds,
+ num_fg_img,
+ ) = self.get_assignments( # noqa
+ batch_idx,
+ num_gt,
+ gt_bboxes_per_image,
+ gt_classes,
+ bboxes_preds_per_image,
+ expanded_strides,
+ x_shifts,
+ y_shifts,
+ cls_preds,
+ obj_preds,
+ )
+ except RuntimeError as e:
+ # TODO: the string might change, consider a better way
+ if "CUDA out of memory. " not in str(e):
+ raise  # RuntimeError might not be caused by CUDA OOM
+
+ logger.error(
+ "OOM RuntimeError is raised due to the huge memory cost during label assignment. \
+ CPU mode is applied in this batch. If you want to avoid this issue, \
+ try to reduce the batch size or image size."
+ )
+ torch.cuda.empty_cache()
+ (
+ gt_matched_classes,
+ fg_mask,
+ pred_ious_this_matching,
+ matched_gt_inds,
+ num_fg_img,
+ ) = self.get_assignments( # noqa
+ batch_idx,
+ num_gt,
+ gt_bboxes_per_image,
+ gt_classes,
+ bboxes_preds_per_image,
+ expanded_strides,
+ x_shifts,
+ y_shifts,
+ cls_preds,
+ obj_preds,
+ "cpu",
+ )
+
+ torch.cuda.empty_cache()
+ num_fg += num_fg_img
+
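+ # Soft classification targets: one-hot class vectors scaled by the IoU of the matched prediction.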
+ cls_target = F.one_hot(
+ gt_matched_classes.to(torch.int64), self.num_classes
+ ) * pred_ious_this_matching.unsqueeze(-1)
+ obj_target = fg_mask.unsqueeze(-1)
+ reg_target = gt_bboxes_per_image[matched_gt_inds]
+ if self.use_l1:
+ l1_target = self.get_l1_target(
+ outputs.new_zeros((num_fg_img, 4)),
+ gt_bboxes_per_image[matched_gt_inds],
+ expanded_strides[0][fg_mask],
+ x_shifts=x_shifts[0][fg_mask],
+ y_shifts=y_shifts[0][fg_mask],
+ )
+
+ cls_targets.append(cls_target)
+ reg_targets.append(reg_target)
+ obj_targets.append(obj_target.to(dtype))
+ fg_masks.append(fg_mask)
+ if self.use_l1:
+ l1_targets.append(l1_target)
+
+ cls_targets = torch.cat(cls_targets, 0)
+ reg_targets = torch.cat(reg_targets, 0)
+ obj_targets = torch.cat(obj_targets, 0)
+ fg_masks = torch.cat(fg_masks, 0)
+ if self.use_l1:
+ l1_targets = torch.cat(l1_targets, 0)
+
+ num_fg = max(num_fg, 1)
+ loss_iou = (
+ self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets)
+ ).sum() / num_fg
+ loss_obj = (
+ self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets)
+ ).sum() / num_fg
+ loss_cls = (
+ self.bcewithlog_loss(
+ cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets
+ )
+ ).sum() / num_fg
+ if self.use_l1:
+ loss_l1 = (
+ self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets)
+ ).sum() / num_fg
+ else:
+ loss_l1 = 0.0
+
+ reg_weight = 5.0
+ loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1
+
+ return (
+ loss,
+ reg_weight * loss_iou,
+ loss_obj,
+ loss_cls,
+ loss_l1,
+ num_fg / max(num_gts, 1),
+ )
+
+ def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8):
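+ # L1 targets are encoded in grid units: center offsets within the cell and log-scaled width/height.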
+ l1_target[:, 0] = gt[:, 0] / stride - x_shifts
+ l1_target[:, 1] = gt[:, 1] / stride - y_shifts
+ l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps)
+ l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps)
+ return l1_target
+
+ @torch.no_grad()
+ def get_assignments(
+ self,
+ batch_idx,
+ num_gt,
+ gt_bboxes_per_image,
+ gt_classes,
+ bboxes_preds_per_image,
+ expanded_strides,
+ x_shifts,
+ y_shifts,
+ cls_preds,
+ obj_preds,
+ mode="gpu",
+ ):
+
+ if mode == "cpu":
+ print("-----------Using CPU for the Current Batch-------------")
+ gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
+ bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
+ gt_classes = gt_classes.cpu().float()
+ expanded_strides = expanded_strides.cpu().float()
+ x_shifts = x_shifts.cpu()
+ y_shifts = y_shifts.cpu()
+
+ fg_mask, geometry_relation = self.get_geometry_constraint(
+ gt_bboxes_per_image,
+ expanded_strides,
+ x_shifts,
+ y_shifts,
+ )
+
+ bboxes_preds_per_image = bboxes_preds_per_image[fg_mask]
+ cls_preds_ = cls_preds[batch_idx][fg_mask]
+ obj_preds_ = obj_preds[batch_idx][fg_mask]
+ num_in_boxes_anchor = bboxes_preds_per_image.shape[0]
+
+ if mode == "cpu":
+ gt_bboxes_per_image = gt_bboxes_per_image.cpu()
+ bboxes_preds_per_image = bboxes_preds_per_image.cpu()
+
+ pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False)
+
+ gt_cls_per_image = (
+ F.one_hot(gt_classes.to(torch.int64), self.num_classes)
+ .float()
+ )
+ pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)
+
+ if mode == "cpu":
+ cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()
+
+ with torch.cuda.amp.autocast(enabled=False):
+ cls_preds_ = (
+ cls_preds_.float().sigmoid_() * obj_preds_.float().sigmoid_()
+ ).sqrt()
+ pair_wise_cls_loss = F.binary_cross_entropy(
+ cls_preds_.unsqueeze(0).repeat(num_gt, 1, 1),
+ gt_cls_per_image.unsqueeze(1).repeat(1, num_in_boxes_anchor, 1),
+ reduction="none"
+ ).sum(-1)
+ del cls_preds_
+
+ cost = (
+ pair_wise_cls_loss
+ + 3.0 * pair_wise_ious_loss
+ + float(1e6) * (~geometry_relation)
+ )
+
+ (
+ num_fg,
+ gt_matched_classes,
+ pred_ious_this_matching,
+ matched_gt_inds,
+ ) = self.simota_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
+ del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss
+
+ if mode == "cpu":
+ gt_matched_classes = gt_matched_classes.cuda()
+ fg_mask = fg_mask.cuda()
+ pred_ious_this_matching = pred_ious_this_matching.cuda()
+ matched_gt_inds = matched_gt_inds.cuda()
+
+ return (
+ gt_matched_classes,
+ fg_mask,
+ pred_ious_this_matching,
+ matched_gt_inds,
+ num_fg,
+ )
+
+ def get_geometry_constraint(
+ self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts,
+ ):
+ """
+ Calculate whether the center of an object falls within a fixed range of
+ an anchor. This is used to avoid inappropriate matching and also reduces
+ the number of candidate anchors, which saves GPU memory.
+ """
+ expanded_strides_per_image = expanded_strides[0]
+ x_centers_per_image = ((x_shifts[0] + 0.5) * expanded_strides_per_image).unsqueeze(0)
+ y_centers_per_image = ((y_shifts[0] + 0.5) * expanded_strides_per_image).unsqueeze(0)
+
+ # in fixed center
+ center_radius = 1.5
+ center_dist = expanded_strides_per_image.unsqueeze(0) * center_radius
+ gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0:1]) - center_dist
+ gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0:1]) + center_dist
+ gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1:2]) - center_dist
+ gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1:2]) + center_dist
+
+ c_l = x_centers_per_image - gt_bboxes_per_image_l
+ c_r = gt_bboxes_per_image_r - x_centers_per_image
+ c_t = y_centers_per_image - gt_bboxes_per_image_t
+ c_b = gt_bboxes_per_image_b - y_centers_per_image
+ center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
+ is_in_centers = center_deltas.min(dim=-1).values > 0.0
+ anchor_filter = is_in_centers.sum(dim=0) > 0
+ geometry_relation = is_in_centers[:, anchor_filter]
+
+ return anchor_filter, geometry_relation
+
+ def simota_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
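+ # SimOTA: each ground truth selects its top-k lowest-cost anchors, where k is derived from the sum of its top-10 IoUs.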
+ matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)
+
+ n_candidate_k = min(10, pair_wise_ious.size(1))
+ topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1)
+ dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
+ for gt_idx in range(num_gt):
+ _, pos_idx = torch.topk(
+ cost[gt_idx], k=dynamic_ks[gt_idx], largest=False
+ )
+ matching_matrix[gt_idx][pos_idx] = 1
+
+ del topk_ious, dynamic_ks, pos_idx
+
+ anchor_matching_gt = matching_matrix.sum(0)
+ # deal with the case that one anchor matches multiple ground-truths
+ if anchor_matching_gt.max() > 1:
+ multiple_match_mask = anchor_matching_gt > 1
+ _, cost_argmin = torch.min(cost[:, multiple_match_mask], dim=0)
+ matching_matrix[:, multiple_match_mask] *= 0
+ matching_matrix[cost_argmin, multiple_match_mask] = 1
+ fg_mask_inboxes = anchor_matching_gt > 0
+ num_fg = fg_mask_inboxes.sum().item()
+
+ fg_mask[fg_mask.clone()] = fg_mask_inboxes
+
+ matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
+ gt_matched_classes = gt_classes[matched_gt_inds]
+
+ pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[
+ fg_mask_inboxes
+ ]
+ return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
+
+ def visualize_assign_result(self, xin, labels=None, imgs=None, save_prefix="assign_vis_"):
+ # original forward logic
+ outputs, x_shifts, y_shifts, expanded_strides = [], [], [], []
+ # TODO: use forward logic here.
+
+ for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
+ zip(self.cls_convs, self.reg_convs, self.strides, xin)
+ ):
+ x = self.stems[k](x)
+ cls_x = x
+ reg_x = x
+
+ cls_feat = cls_conv(cls_x)
+ cls_output = self.cls_preds[k](cls_feat)
+ reg_feat = reg_conv(reg_x)
+ reg_output = self.reg_preds[k](reg_feat)
+ obj_output = self.obj_preds[k](reg_feat)
+
+ output = torch.cat([reg_output, obj_output, cls_output], 1)
+ output, grid = self.get_output_and_grid(output, k, stride_this_level, xin[0].type())
+ x_shifts.append(grid[:, :, 0])
+ y_shifts.append(grid[:, :, 1])
+ expanded_strides.append(
+ torch.full((1, grid.shape[1]), stride_this_level).type_as(xin[0])
+ )
+ outputs.append(output)
+
+ outputs = torch.cat(outputs, 1)
+ bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4]
+ obj_preds = outputs[:, :, 4:5] # [batch, n_anchors_all, 1]
+ cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls]
+
+ # calculate targets
+ total_num_anchors = outputs.shape[1]
+ x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all]
+ y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all]
+ expanded_strides = torch.cat(expanded_strides, 1)
+
+ nlabel = (labels.sum(dim=2) > 0).sum(dim=1) # number of objects
+ for batch_idx, (img, num_gt, label) in enumerate(zip(imgs, nlabel, labels)):
+ img = imgs[batch_idx].permute(1, 2, 0).to(torch.uint8)
+ num_gt = int(num_gt)
+ if num_gt == 0:
+ fg_mask = outputs.new_zeros(total_num_anchors).bool()
+ else:
+ gt_bboxes_per_image = label[:num_gt, 1:5]
+ gt_classes = label[:num_gt, 0]
+ bboxes_preds_per_image = bbox_preds[batch_idx]
+ _, fg_mask, _, matched_gt_inds, _ = self.get_assignments( # noqa
+ batch_idx, num_gt, gt_bboxes_per_image, gt_classes,
+ bboxes_preds_per_image, expanded_strides, x_shifts,
+ y_shifts, cls_preds, obj_preds,
+ )
+
+ img = img.cpu().numpy().copy() # copy is crucial here
+ coords = torch.stack([
+ ((x_shifts + 0.5) * expanded_strides).flatten()[fg_mask],
+ ((y_shifts + 0.5) * expanded_strides).flatten()[fg_mask],
+ ], 1)
+
+ xyxy_boxes = cxcywh2xyxy(gt_bboxes_per_image)
+ save_name = save_prefix + str(batch_idx) + ".png"
+ img = visualize_assign(img, xyxy_boxes, coords, matched_gt_inds, save_name)
+ logger.info(f"save img to {save_name}")
diff --git a/what/models/detection/yolox/models/yolo_pafpn.py b/what/models/detection/yolox/models/yolo_pafpn.py
new file mode 100644
index 0000000..4c4e18a
--- /dev/null
+++ b/what/models/detection/yolox/models/yolo_pafpn.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import torch
+import torch.nn as nn
+
+from .darknet import CSPDarknet
+from .network_blocks import BaseConv, CSPLayer, DWConv
+
+
+class YOLOPAFPN(nn.Module):
+ """
+ YOLO PAFPN module: a CSPDarknet backbone followed by a PAFPN feature pyramid.
+ """
+
+ def __init__(
+ self,
+ depth=1.0,
+ width=1.0,
+ in_features=("dark3", "dark4", "dark5"),
+ in_channels=[256, 512, 1024],
+ depthwise=False,
+ act="silu",
+ ):
+ super().__init__()
+ self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
+ self.in_features = in_features
+ self.in_channels = in_channels
+ Conv = DWConv if depthwise else BaseConv
+
+ self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
+ self.lateral_conv0 = BaseConv(
+ int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act
+ )
+ self.C3_p4 = CSPLayer(
+ int(2 * in_channels[1] * width),
+ int(in_channels[1] * width),
+ round(3 * depth),
+ False,
+ depthwise=depthwise,
+ act=act,
+ ) # cat
+
+ self.reduce_conv1 = BaseConv(
+ int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act
+ )
+ self.C3_p3 = CSPLayer(
+ int(2 * in_channels[0] * width),
+ int(in_channels[0] * width),
+ round(3 * depth),
+ False,
+ depthwise=depthwise,
+ act=act,
+ )
+
+ # bottom-up conv
+ self.bu_conv2 = Conv(
+ int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act
+ )
+ self.C3_n3 = CSPLayer(
+ int(2 * in_channels[0] * width),
+ int(in_channels[1] * width),
+ round(3 * depth),
+ False,
+ depthwise=depthwise,
+ act=act,
+ )
+
+ # bottom-up conv
+ self.bu_conv1 = Conv(
+ int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act
+ )
+ self.C3_n4 = CSPLayer(
+ int(2 * in_channels[1] * width),
+ int(in_channels[2] * width),
+ round(3 * depth),
+ False,
+ depthwise=depthwise,
+ act=act,
+ )
+
+ def forward(self, input):
+ """
+ Args:
+ input (Tensor): input images.
+
+ Returns:
+ Tuple[Tensor]: FPN features.
+ """
+
+ # backbone
+ out_features = self.backbone(input)
+ features = [out_features[f] for f in self.in_features]
+ [x2, x1, x0] = features
+
+ fpn_out0 = self.lateral_conv0(x0) # 1024->512/32
+ f_out0 = self.upsample(fpn_out0) # 512/16
+ f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16
+ f_out0 = self.C3_p4(f_out0) # 1024->512/16
+
+ fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16
+ f_out1 = self.upsample(fpn_out1) # 256/8
+ f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8
+ pan_out2 = self.C3_p3(f_out1) # 512->256/8
+
+ p_out1 = self.bu_conv2(pan_out2) # 256->256/16
+ p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16
+ pan_out1 = self.C3_n3(p_out1) # 512->512/16
+
+ p_out0 = self.bu_conv1(pan_out1) # 512->512/32
+ p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32
+ pan_out0 = self.C3_n4(p_out0) # 1024->1024/32
+
+ outputs = (pan_out2, pan_out1, pan_out0)
+ return outputs
diff --git a/what/models/detection/yolox/models/yolox.py b/what/models/detection/yolox/models/yolox.py
new file mode 100644
index 0000000..744ceea
--- /dev/null
+++ b/what/models/detection/yolox/models/yolox.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import torch.nn as nn
+
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+
+
+class YOLOX(nn.Module):
+ """
+ YOLOX model module, composed of a backbone (YOLOPAFPN) and a detection head (YOLOXHead).
+ The network returns loss values from the three YOLO layers during training
+ and decoded detection results during inference.
+ """
+
+ def __init__(self, backbone=None, head=None):
+ super().__init__()
+ if backbone is None:
+ backbone = YOLOPAFPN()
+ if head is None:
+ head = YOLOXHead(80)
+
+ self.backbone = backbone
+ self.head = head
+
+ def forward(self, x, targets=None):
+ # fpn output content features of [dark3, dark4, dark5]
+ fpn_outs = self.backbone(x)
+
+ if self.training:
+ assert targets is not None
+ loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head(
+ fpn_outs, targets, x
+ )
+ outputs = {
+ "total_loss": loss,
+ "iou_loss": iou_loss,
+ "l1_loss": l1_loss,
+ "conf_loss": conf_loss,
+ "cls_loss": cls_loss,
+ "num_fg": num_fg,
+ }
+ else:
+ outputs = self.head(fpn_outs)
+
+ return outputs
+
+ def visualize(self, x, targets, save_prefix="assign_vis_"):
+ fpn_outs = self.backbone(x)
+ self.head.visualize_assign_result(fpn_outs, targets, x, save_prefix)
diff --git a/what/models/detection/yolox/predictor.py b/what/models/detection/yolox/predictor.py
new file mode 100644
index 0000000..6909e93
--- /dev/null
+++ b/what/models/detection/yolox/predictor.py
@@ -0,0 +1,60 @@
+import torch
+
+from .data.data_augment import ValTransform
+from .data.datasets import COCO_CLASSES
+from .utils import postprocess
+
+class Predictor(object):
+ def __init__(
+ self,
+ model,
+ exp,
+ cls_names=COCO_CLASSES,
+ trt_file=None,
+ decoder=None,
+ device="cpu",
+ fp16=False,
+ legacy=False,
+ ):
+ self.model = model
+ self.cls_names = cls_names
+ self.decoder = decoder
+ self.num_classes = exp.num_classes
+ self.confthre = exp.test_conf
+ self.nmsthre = exp.nmsthre
+ self.test_size = exp.test_size
+ self.device = device
+ self.fp16 = fp16
+ self.preproc = ValTransform(legacy=legacy)
+
+ if self.device == "gpu":
+ model.cuda()
+ model.eval()
+
+ def inference(self, img):
+ img_info = {"id": 0}
+ height, width = img.shape[:2]
+ img_info["height"] = height
+ img_info["width"] = width
+ img_info["raw_img"] = img
+
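+ # The resize ratio is stored so that callers can map predicted boxes back to the original image scale.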
+ ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1])
+ img_info["ratio"] = ratio
+
+ img, _ = self.preproc(img, None, self.test_size)
+ img = torch.from_numpy(img).unsqueeze(0)
+ img = img.float()
+ if self.device == "gpu":
+ img = img.cuda()
+ if self.fp16:
+ img = img.half() # to FP16
+
+ with torch.no_grad():
+ outputs = self.model(img)
+ if self.decoder is not None:
+ outputs = self.decoder(outputs, dtype=outputs.type())
+ outputs = postprocess(
+ outputs, self.num_classes, self.confthre,
+ self.nmsthre, class_agnostic=True
+ )
+ return outputs, img_info
diff --git a/what/models/detection/yolox/tools/__init__.py b/what/models/detection/yolox/tools/__init__.py
new file mode 100644
index 0000000..0944290
--- /dev/null
+++ b/what/models/detection/yolox/tools/__init__.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+# This file is used for package installation. Scripts for train/eval/export will be available.
+
+import sys
+from importlib import abc, util
+from pathlib import Path
+
+_TOOLS_PATH = Path(__file__).resolve().parent.parent.parent / "tools"
+
+if _TOOLS_PATH.is_dir():
+ # This is true only for in-place installation (pip install -e, setup.py develop),
+ # where setup(package_dir=) does not work: https://github.com/pypa/setuptools/issues/230
+
+ class _PathFinder(abc.MetaPathFinder):
+
+ def find_spec(self, name, path, target=None):
+ if not name.startswith("yolox.tools."):
+ return
+ project_name = name.split(".")[-1] + ".py"
+ target_file = _TOOLS_PATH / project_name
+ if not target_file.is_file():
+ return
+ return util.spec_from_file_location(name, target_file)
+
+ sys.meta_path.append(_PathFinder())
diff --git a/what/models/detection/yolox/utils/__init__.py b/what/models/detection/yolox/utils/__init__.py
new file mode 100644
index 0000000..08e6dae
--- /dev/null
+++ b/what/models/detection/yolox/utils/__init__.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+from .allreduce_norm import *
+from .boxes import *
+from .checkpoint import load_ckpt, save_checkpoint
+from .compat import meshgrid
+from .demo_utils import *
+from .dist import *
+from .ema import *
+from .logger import WandbLogger, setup_logger
+from .lr_scheduler import LRScheduler
+from .metric import *
+from .model_utils import *
+from .setup_env import *
+from .visualize import *
diff --git a/what/models/detection/yolox/utils/allreduce_norm.py b/what/models/detection/yolox/utils/allreduce_norm.py
new file mode 100644
index 0000000..142c76c
--- /dev/null
+++ b/what/models/detection/yolox/utils/allreduce_norm.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import pickle
+from collections import OrderedDict
+
+import torch
+from torch import distributed as dist
+from torch import nn
+
+from .dist import _get_global_gloo_group, get_world_size
+
+ASYNC_NORM = (
+ nn.BatchNorm1d,
+ nn.BatchNorm2d,
+ nn.BatchNorm3d,
+ nn.InstanceNorm1d,
+ nn.InstanceNorm2d,
+ nn.InstanceNorm3d,
+)
+
+__all__ = [
+ "get_async_norm_states",
+ "pyobj2tensor",
+ "tensor2pyobj",
+ "all_reduce",
+ "all_reduce_norm",
+]
+
+
+def get_async_norm_states(module):
+ async_norm_states = OrderedDict()
+ for name, child in module.named_modules():
+ if isinstance(child, ASYNC_NORM):
+ for k, v in child.state_dict().items():
+ async_norm_states[".".join([name, k])] = v
+ return async_norm_states
+
+
+def pyobj2tensor(pyobj, device="cuda"):
+ """serialize picklable python object to tensor"""
+ storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj))
+ return torch.ByteTensor(storage).to(device=device)
+
+
+def tensor2pyobj(tensor):
+ """deserialize tensor to picklable python object"""
+ return pickle.loads(tensor.cpu().numpy().tobytes())
+
+
+def _get_reduce_op(op_name):
+ return {
+ "sum": dist.ReduceOp.SUM,
+ "mean": dist.ReduceOp.SUM,
+ }[op_name.lower()]
+
+
+def all_reduce(py_dict, op="sum", group=None):
+ """
+ Apply all reduce function for python dict object.
+ NOTE: make sure that every py_dict has the same keys and that the values have the same shapes.
+
+ Args:
+ py_dict (dict): dict to apply all reduce op.
+ op (str): operator, could be "sum" or "mean".
+ """
+ world_size = get_world_size()
+ if world_size == 1:
+ return py_dict
+ if group is None:
+ group = _get_global_gloo_group()
+ if dist.get_world_size(group) == 1:
+ return py_dict
+
+ # all reduce logic across different devices.
+ py_key = list(py_dict.keys())
+ py_key_tensor = pyobj2tensor(py_key)
+ dist.broadcast(py_key_tensor, src=0)
+ py_key = tensor2pyobj(py_key_tensor)
+
+ tensor_shapes = [py_dict[k].shape for k in py_key]
+ tensor_numels = [py_dict[k].numel() for k in py_key]
+
+ flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
+ dist.all_reduce(flatten_tensor, op=_get_reduce_op(op))
+ if op == "mean":
+ flatten_tensor /= world_size
+
+ split_tensors = [
+ x.reshape(shape)
+ for x, shape in zip(torch.split(flatten_tensor, tensor_numels), tensor_shapes)
+ ]
+ return OrderedDict({k: v for k, v in zip(py_key, split_tensors)})
+
+
+def all_reduce_norm(module):
+ """
+ All reduce norm statistics in different devices.
+ """
+ states = get_async_norm_states(module)
+ states = all_reduce(states, op="mean")
+ module.load_state_dict(states, strict=False)
diff --git a/what/models/detection/yolox/utils/boxes.py b/what/models/detection/yolox/utils/boxes.py
new file mode 100644
index 0000000..f71e8d9
--- /dev/null
+++ b/what/models/detection/yolox/utils/boxes.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import numpy as np
+
+import torch
+import torchvision
+
+__all__ = [
+ "filter_box",
+ "postprocess",
+ "bboxes_iou",
+ "matrix_iou",
+ "adjust_box_anns",
+ "xyxy2xywh",
+ "xyxy2cxcywh",
+ "cxcywh2xyxy",
+]
+
+
+def filter_box(output, scale_range):
+ """
+ output: (N, 5+class) shape
+ """
+ min_scale, max_scale = scale_range
+ w = output[:, 2] - output[:, 0]
+ h = output[:, 3] - output[:, 1]
+ keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
+ return output[keep]
+
+
+def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
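+ # Convert (cx, cy, w, h) box predictions to (x1, y1, x2, y2) corners before confidence filtering and NMS.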
+ box_corner = prediction.new(prediction.shape)
+ box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+ box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+ box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+ box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+ prediction[:, :, :4] = box_corner[:, :, :4]
+
+ output = [None for _ in range(len(prediction))]
+ for i, image_pred in enumerate(prediction):
+
+ # If none are remaining => process next image
+ if not image_pred.size(0):
+ continue
+ # Get score and class with highest confidence
+ class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
+
+ conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
+ # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+ detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
+ detections = detections[conf_mask]
+ if not detections.size(0):
+ continue
+
+ if class_agnostic:
+ nms_out_index = torchvision.ops.nms(
+ detections[:, :4],
+ detections[:, 4] * detections[:, 5],
+ nms_thre,
+ )
+ else:
+ nms_out_index = torchvision.ops.batched_nms(
+ detections[:, :4],
+ detections[:, 4] * detections[:, 5],
+ detections[:, 6],
+ nms_thre,
+ )
+
+ detections = detections[nms_out_index]
+ if output[i] is None:
+ output[i] = detections
+ else:
+ output[i] = torch.cat((output[i], detections))
+
+ return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
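+ # Pairwise IoU between two sets of boxes; boxes are (x1, y1, x2, y2) if xyxy=True, otherwise (cx, cy, w, h).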
+ if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
+ raise IndexError
+
+ if xyxy:
+ tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
+ br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
+ area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
+ area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
+ else:
+ tl = torch.max(
+ (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
+ (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
+ )
+ br = torch.min(
+ (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
+ (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
+ )
+
+ area_a = torch.prod(bboxes_a[:, 2:], 1)
+ area_b = torch.prod(bboxes_b[:, 2:], 1)
+ en = (tl < br).type(tl.type()).prod(dim=2)
+ area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all())
+ return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def matrix_iou(a, b):
+ """
+ return iou of a and b, numpy version for data augmentation
+ """
+ lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+ rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+ area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
+ area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+ area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+ return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12)
+
+
+def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
+ bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)
+ bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)
+ return bbox
+
+
+def xyxy2xywh(bboxes):
+ bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+ bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+ return bboxes
+
+
+def xyxy2cxcywh(bboxes):
+ bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+ bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+ bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] * 0.5
+ bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] * 0.5
+ return bboxes
+
+
+def cxcywh2xyxy(bboxes):
+ bboxes[:, 0] = bboxes[:, 0] - bboxes[:, 2] * 0.5
+ bboxes[:, 1] = bboxes[:, 1] - bboxes[:, 3] * 0.5
+ bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
+ bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
+ return bboxes
diff --git a/what/models/detection/yolox/utils/checkpoint.py b/what/models/detection/yolox/utils/checkpoint.py
new file mode 100644
index 0000000..a0c200e
--- /dev/null
+++ b/what/models/detection/yolox/utils/checkpoint.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import os
+import shutil
+from loguru import logger
+
+import torch
+
+
+def load_ckpt(model, ckpt):
+ model_state_dict = model.state_dict()
+ load_dict = {}
+ for key_model, v in model_state_dict.items():
+ if key_model not in ckpt:
+ logger.warning(
+ "{} is not in the ckpt. Please double check and see if this is desired.".format(
+ key_model
+ )
+ )
+ continue
+ v_ckpt = ckpt[key_model]
+ if v.shape != v_ckpt.shape:
+ logger.warning(
+ "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+ key_model, v_ckpt.shape, key_model, v.shape
+ )
+ )
+ continue
+ load_dict[key_model] = v_ckpt
+
+ model.load_state_dict(load_dict, strict=False)
+ return model
+
+
+def save_checkpoint(state, is_best, save_dir, model_name=""):
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+ filename = os.path.join(save_dir, model_name + "_ckpt.pth")
+ torch.save(state, filename)
+ if is_best:
+ best_filename = os.path.join(save_dir, "best_ckpt.pth")
+ shutil.copyfile(filename, best_filename)
diff --git a/what/models/detection/yolox/utils/compat.py b/what/models/detection/yolox/utils/compat.py
new file mode 100644
index 0000000..1324077
--- /dev/null
+++ b/what/models/detection/yolox/utils/compat.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import torch
+
+_TORCH_VER = [int(x) for x in torch.__version__.split(".")[:2]]
+
+__all__ = ["meshgrid"]
+
+
+def meshgrid(*tensors):
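+ # torch >= 1.10 requires an explicit indexing argument; "ij" matches the behavior of older versions.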
+ if _TORCH_VER >= [1, 10]:
+ return torch.meshgrid(*tensors, indexing="ij")
+ else:
+ return torch.meshgrid(*tensors)
diff --git a/what/models/detection/yolox/utils/demo_utils.py b/what/models/detection/yolox/utils/demo_utils.py
new file mode 100644
index 0000000..56dd336
--- /dev/null
+++ b/what/models/detection/yolox/utils/demo_utils.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import os
+import random
+
+import cv2
+import numpy as np
+
+__all__ = [
+ "mkdir", "nms", "multiclass_nms", "demo_postprocess", "random_color", "visualize_assign"
+]
+
+
+def random_color():
+ return random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)
+
+
+def visualize_assign(img, boxes, coords, match_results, save_name=None) -> np.ndarray:
+ """visualize label assign result.
+
+ Args:
+ img: img to visualize
+ boxes: gt boxes in xyxy format
+ coords: coords of matched anchors
+ match_results: match results of each gt box and coord.
+ save_name: filename of the saved image; if None, the image will not be saved. Default: None.
+ """
+ for box_id, box in enumerate(boxes):
+ x1, y1, x2, y2 = box
+ color = random_color()
+ assign_coords = coords[match_results == box_id]
+ if assign_coords.numel() == 0:
+ # unmatched boxes are red
+ color = (0, 0, 255)
+ cv2.putText(
+ img, "unmatched", (int(x1), int(y1) - 5),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 1
+ )
+ else:
+ for coord in assign_coords:
+ # draw assigned anchor
+ cv2.circle(img, (int(coord[0]), int(coord[1])), 3, color, -1)
+ cv2.rectangle(img, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
+
+ if save_name is not None:
+ cv2.imwrite(save_name, img)
+
+ return img
+
+
+def mkdir(path):
+ if not os.path.exists(path):
+ os.makedirs(path)
+
+
+def nms(boxes, scores, nms_thr):
+ """Single class NMS implemented in Numpy."""
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ order = scores.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ i = order[0]
+ keep.append(i)
+ xx1 = np.maximum(x1[i], x1[order[1:]])
+ yy1 = np.maximum(y1[i], y1[order[1:]])
+ xx2 = np.minimum(x2[i], x2[order[1:]])
+ yy2 = np.minimum(y2[i], y2[order[1:]])
+
+ w = np.maximum(0.0, xx2 - xx1 + 1)
+ h = np.maximum(0.0, yy2 - yy1 + 1)
+ inter = w * h
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+ inds = np.where(ovr <= nms_thr)[0]
+ order = order[inds + 1]
+
+ return keep
+
+
+def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True):
+ """Multiclass NMS implemented in Numpy"""
+ if class_agnostic:
+ nms_method = multiclass_nms_class_agnostic
+ else:
+ nms_method = multiclass_nms_class_aware
+ return nms_method(boxes, scores, nms_thr, score_thr)
+
+
+def multiclass_nms_class_aware(boxes, scores, nms_thr, score_thr):
+ """Multiclass NMS implemented in Numpy. Class-aware version."""
+ final_dets = []
+ num_classes = scores.shape[1]
+ for cls_ind in range(num_classes):
+ cls_scores = scores[:, cls_ind]
+ valid_score_mask = cls_scores > score_thr
+ if valid_score_mask.sum() == 0:
+ continue
+ else:
+ valid_scores = cls_scores[valid_score_mask]
+ valid_boxes = boxes[valid_score_mask]
+ keep = nms(valid_boxes, valid_scores, nms_thr)
+ if len(keep) > 0:
+ cls_inds = np.ones((len(keep), 1)) * cls_ind
+ dets = np.concatenate(
+ [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+ )
+ final_dets.append(dets)
+ if len(final_dets) == 0:
+ return None
+ return np.concatenate(final_dets, 0)
+
+
+def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr):
+ """Multiclass NMS implemented in Numpy. Class-agnostic version."""
+ cls_inds = scores.argmax(1)
+ cls_scores = scores[np.arange(len(cls_inds)), cls_inds]
+
+ valid_score_mask = cls_scores > score_thr
+ if valid_score_mask.sum() == 0:
+ return None
+ valid_scores = cls_scores[valid_score_mask]
+ valid_boxes = boxes[valid_score_mask]
+ valid_cls_inds = cls_inds[valid_score_mask]
+ keep = nms(valid_boxes, valid_scores, nms_thr)
+ if keep:
+ dets = np.concatenate(
+ [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1
+ )
+ return dets
+
+
+def demo_postprocess(outputs, img_size, p6=False):
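+ # Build anchor-center grids for each stride and decode the raw outputs in place (NumPy version).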
+ grids = []
+ expanded_strides = []
+ strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+ hsizes = [img_size[0] // stride for stride in strides]
+ wsizes = [img_size[1] // stride for stride in strides]
+
+ for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+ xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+ grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+ grids.append(grid)
+ shape = grid.shape[:2]
+ expanded_strides.append(np.full((*shape, 1), stride))
+
+ grids = np.concatenate(grids, 1)
+ expanded_strides = np.concatenate(expanded_strides, 1)
+ outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+ outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+ return outputs
diff --git a/what/models/detection/yolox/utils/dist.py b/what/models/detection/yolox/utils/dist.py
new file mode 100644
index 0000000..9e8fea9
--- /dev/null
+++ b/what/models/detection/yolox/utils/dist.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# This file mainly comes from
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) Megvii Inc. All rights reserved.
+"""
+This file contains primitives for multi-gpu communication.
+This is useful when doing distributed training.
+"""
+
+import functools
+import os
+import pickle
+import time
+from contextlib import contextmanager
+from loguru import logger
+
+import numpy as np
+
+import torch
+from torch import distributed as dist
+
+__all__ = [
+ "get_num_devices",
+ "wait_for_the_master",
+ "is_main_process",
+ "synchronize",
+ "get_world_size",
+ "get_rank",
+ "get_local_rank",
+ "get_local_size",
+ "time_synchronized",
+ "gather",
+ "all_gather",
+]
+
+_LOCAL_PROCESS_GROUP = None
+
+
+def get_num_devices():
+ gpu_list = os.getenv('CUDA_VISIBLE_DEVICES', None)
+ if gpu_list is not None:
+ return len(gpu_list.split(','))
+ else:
+ devices_list_info = os.popen("nvidia-smi -L")
+ devices_list_info = devices_list_info.read().strip().split("\n")
+ return len(devices_list_info)
+
+
+@contextmanager
+def wait_for_the_master(local_rank: int = None):
+ """
+ Make all processes wait for the master to finish some task.
+
+ Args:
+ local_rank (int): the local rank of the current process. Defaults to None,
+ in which case the local rank of the current process is used.
+ """
+ if local_rank is None:
+ local_rank = get_local_rank()
+
+ if local_rank > 0:
+ dist.barrier()
+ yield
+ if local_rank == 0:
+ if not dist.is_available():
+ return
+ if not dist.is_initialized():
+ return
+ else:
+ dist.barrier()
+
+
+def synchronize():
+ """
+ Helper function to synchronize (barrier) among all processes when using distributed training
+ """
+ if not dist.is_available():
+ return
+ if not dist.is_initialized():
+ return
+ world_size = dist.get_world_size()
+ if world_size == 1:
+ return
+ dist.barrier()
+
+
+def get_world_size() -> int:
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size()
+
+
+def get_rank() -> int:
+ if not dist.is_available():
+ return 0
+ if not dist.is_initialized():
+ return 0
+ return dist.get_rank()
+
+
+def get_local_rank() -> int:
+ """
+ Returns:
+ The rank of the current process within the local (per-machine) process group.
+ """
+ if _LOCAL_PROCESS_GROUP is None:
+ return get_rank()
+
+ if not dist.is_available():
+ return 0
+ if not dist.is_initialized():
+ return 0
+ return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+
+
+def get_local_size() -> int:
+ """
+ Returns:
+ The size of the per-machine process group, i.e. the number of processes per machine.
+ """
+ if not dist.is_available():
+ return 1
+ if not dist.is_initialized():
+ return 1
+ return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+
+
+def is_main_process() -> bool:
+ return get_rank() == 0
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+ """
+ Return a process group based on gloo backend, containing all the ranks
+ The result is cached.
+ """
+ if dist.get_backend() == "nccl":
+ return dist.new_group(backend="gloo")
+ else:
+ return dist.group.WORLD
+
+
+def _serialize_to_tensor(data, group):
+ backend = dist.get_backend(group)
+ assert backend in ["gloo", "nccl"]
+ device = torch.device("cpu" if backend == "gloo" else "cuda")
+
+ buffer = pickle.dumps(data)
+ if len(buffer) > 1024 ** 3:
+ logger.warning(
+ "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
+ get_rank(), len(buffer) / (1024 ** 3), device
+ )
+ )
+ storage = torch.ByteStorage.from_buffer(buffer)
+ tensor = torch.ByteTensor(storage).to(device=device)
+ return tensor
+
+
+def _pad_to_largest_tensor(tensor, group):
+ """
+ Returns:
+ list[int]: size of the tensor, on each rank
+ Tensor: padded tensor that has the max size
+ """
+ world_size = dist.get_world_size(group=group)
+ assert (
+ world_size >= 1
+ ), "comm.gather/all_gather must be called from ranks within the given group!"
+ local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
+ size_list = [
+ torch.zeros([1], dtype=torch.int64, device=tensor.device)
+ for _ in range(world_size)
+ ]
+ dist.all_gather(size_list, local_size, group=group)
+ size_list = [int(size.item()) for size in size_list]
+
+ max_size = max(size_list)
+
+ # we pad the tensor because torch all_gather does not support
+ # gathering tensors of different shapes
+ if local_size != max_size:
+ padding = torch.zeros(
+ (max_size - local_size,), dtype=torch.uint8, device=tensor.device
+ )
+ tensor = torch.cat((tensor, padding), dim=0)
+ return size_list, tensor
+
+
+def all_gather(data, group=None):
+ """
+ Run all_gather on arbitrary picklable data (not necessarily tensors).
+
+ Args:
+ data: any picklable object
+ group: a torch process group. By default, will use a group which
+ contains all ranks on gloo backend.
+ Returns:
+ list[data]: list of data gathered from each rank
+ """
+ if get_world_size() == 1:
+ return [data]
+ if group is None:
+ group = _get_global_gloo_group()
+ if dist.get_world_size(group) == 1:
+ return [data]
+
+ tensor = _serialize_to_tensor(data, group)
+
+ size_list, tensor = _pad_to_largest_tensor(tensor, group)
+ max_size = max(size_list)
+
+ # receiving Tensor from all ranks
+ tensor_list = [
+ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
+ for _ in size_list
+ ]
+ dist.all_gather(tensor_list, tensor, group=group)
+
+ data_list = []
+ for size, tensor in zip(size_list, tensor_list):
+ buffer = tensor.cpu().numpy().tobytes()[:size]
+ data_list.append(pickle.loads(buffer))
+
+ return data_list
+
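+# Usage sketch (illustrative only; assumes a distributed run is initialised):
+#
+#   stats = {"rank": get_rank(), "num_samples": 128}   # any picklable object
+#   per_rank_stats = all_gather(stats)                  # one entry per rank
+#   if is_main_process():
+#       total = sum(s["num_samples"] for s in per_rank_stats)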
+
+def gather(data, dst=0, group=None):
+ """
+ Run gather on arbitrary picklable data (not necessarily tensors).
+
+ Args:
+ data: any picklable object
+ dst (int): destination rank
+ group: a torch process group. By default, will use a group which
+ contains all ranks on gloo backend.
+
+ Returns:
+ list[data]: on dst, a list of data gathered from each rank. Otherwise,
+ an empty list.
+ """
+ if get_world_size() == 1:
+ return [data]
+ if group is None:
+ group = _get_global_gloo_group()
+ if dist.get_world_size(group=group) == 1:
+ return [data]
+ rank = dist.get_rank(group=group)
+
+ tensor = _serialize_to_tensor(data, group)
+ size_list, tensor = _pad_to_largest_tensor(tensor, group)
+
+ # receiving Tensor from all ranks
+ if rank == dst:
+ max_size = max(size_list)
+ tensor_list = [
+ torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
+ for _ in size_list
+ ]
+ dist.gather(tensor, tensor_list, dst=dst, group=group)
+
+ data_list = []
+ for size, tensor in zip(size_list, tensor_list):
+ buffer = tensor.cpu().numpy().tobytes()[:size]
+ data_list.append(pickle.loads(buffer))
+ return data_list
+ else:
+ dist.gather(tensor, [], dst=dst, group=group)
+ return []
+
+
+def shared_random_seed():
+ """
+ Returns:
+ int: a random number that is the same across all workers.
+ If workers need a shared RNG, they can use this shared seed to
+ create one.
+ All workers must call this function, otherwise it will deadlock.
+ """
+ ints = np.random.randint(2 ** 31)
+ all_ints = all_gather(ints)
+ return all_ints[0]
+
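+# Usage sketch (illustrative only): every worker must call the function, so
+# that all of them receive the same seed and can build identical RNGs.
+#
+#   seed = shared_random_seed()
+#   rng = np.random.RandomState(seed)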
+
+def time_synchronized():
+ """pytorch-accurate time"""
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ return time.time()
diff --git a/what/models/detection/yolox/utils/ema.py b/what/models/detection/yolox/utils/ema.py
new file mode 100644
index 0000000..73acbca
--- /dev/null
+++ b/what/models/detection/yolox/utils/ema.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import math
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+
+__all__ = ["ModelEMA", "is_parallel"]
+
+
+def is_parallel(model):
+ """check if model is in parallel mode."""
+ parallel_type = (
+ nn.parallel.DataParallel,
+ nn.parallel.DistributedDataParallel,
+ )
+ return isinstance(model, parallel_type)
+
+
+class ModelEMA:
+ """
+ Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
+ Keep a moving average of everything in the model state_dict (parameters and buffers).
+ This is intended to allow functionality like
+ https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+ A smoothed version of the weights is necessary for some training schemes to perform well.
+ This class is sensitive to where it is initialized in the sequence of model init,
+ GPU assignment and distributed training wrappers.
+ """
+
+ def __init__(self, model, decay=0.9999, updates=0):
+ """
+ Args:
+ model (nn.Module): model to apply EMA.
+ decay (float): EMA decay rate.
+ updates (int): counter of EMA updates.
+ """
+ # Create EMA(FP32)
+ self.ema = deepcopy(model.module if is_parallel(model) else model).eval()
+ self.updates = updates
+ # decay exponential ramp (to help early epochs)
+ self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
+ for p in self.ema.parameters():
+ p.requires_grad_(False)
+
+ def update(self, model):
+ # Update EMA parameters
+ with torch.no_grad():
+ self.updates += 1
+ d = self.decay(self.updates)
+
+ msd = (
+ model.module.state_dict() if is_parallel(model) else model.state_dict()
+ ) # model state_dict
+ for k, v in self.ema.state_dict().items():
+ if v.dtype.is_floating_point:
+ v *= d
+ v += (1.0 - d) * msd[k].detach()
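+
+
+# Usage sketch (illustrative only; `model`, `optimizer`, the data loader and
+# `evaluate` are assumed to exist in the surrounding training script):
+#
+#   ema = ModelEMA(model, decay=0.9998)
+#   for imgs, targets in dataloader:
+#       loss = model(imgs, targets)
+#       loss.backward()
+#       optimizer.step()
+#       ema.update(model)     # keep the smoothed copy in sync after each step
+#   evaluate(ema.ema)         # evaluate/export with the EMA weights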
diff --git a/what/models/detection/yolox/utils/logger.py b/what/models/detection/yolox/utils/logger.py
new file mode 100644
index 0000000..1045a7b
--- /dev/null
+++ b/what/models/detection/yolox/utils/logger.py
@@ -0,0 +1,440 @@
+#!/usr/bin/env python3
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import inspect
+import os
+import sys
+from collections import defaultdict
+from loguru import logger
+
+import cv2
+import numpy as np
+
+import torch
+
+
+def get_caller_name(depth=0):
+ """
+ Args:
+ depth (int): depth of the caller context; use 0 for the immediate caller.
+ Default value: 0.
+
+ Returns:
+ str: module name of the caller
+ """
+ # the following logic is a little bit faster than inspect.stack() logic
+ frame = inspect.currentframe().f_back
+ for _ in range(depth):
+ frame = frame.f_back
+
+ return frame.f_globals["__name__"]
+
+
+class StreamToLoguru:
+ """
+ stream object that redirects writes to a logger instance.
+ """
+
+ def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
+ """
+ Args:
+ level(str): log level string of loguru. Default value: "INFO".
+ caller_names(tuple): caller names of redirected module.
+ Default value: (apex, pycocotools).
+ """
+ self.level = level
+ self.linebuf = ""
+ self.caller_names = caller_names
+
+ def write(self, buf):
+ full_name = get_caller_name(depth=1)
+ module_name = full_name.rsplit(".", maxsplit=-1)[0]
+ if module_name in self.caller_names:
+ for line in buf.rstrip().splitlines():
+ # use caller level log
+ logger.opt(depth=2).log(self.level, line.rstrip())
+ else:
+ sys.__stdout__.write(buf)
+
+ def flush(self):
+ # flush is related to CPR (cursor position report) in the terminal
+ return sys.__stdout__.flush()
+
+ def isatty(self):
+ # When using Colab, jax is installed by default, and issues like
+ # https://github.com/Megvii-BaseDetection/YOLOX/issues/1437 might be raised
+ # due to a missing attribute like `isatty`.
+ # For more details, check the following link:
+ # https://github.com/google/jax/blob/10720258ea7fb5bde997dfa2f3f71135ab7a6733/jax/_src/pretty_printer.py#L54 # noqa
+ return sys.__stdout__.isatty()
+
+ def fileno(self):
+ # To solve the issue when using debug tools like pdb
+ return sys.__stdout__.fileno()
+
+
+def redirect_sys_output(log_level="INFO"):
+ redirect_logger = StreamToLoguru(log_level)
+ sys.stderr = redirect_logger
+ sys.stdout = redirect_logger
+
+
+def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"):
+ """setup logger for training and testing.
+ Args:
+ save_dir(str): location to save log file
+ distributed_rank(int): device rank when multi-gpu environment
+ filename (string): log save name.
+ mode(str): log file write mode, `a` (append) or `o` (override). Default is `a`.
+
+ Return:
+ None. The global loguru logger is configured in place.
+ """
+ loguru_format = (
+ "{time:YYYY-MM-DD HH:mm:ss} | "
+ "{level: <8} | "
+ "{name}:{line} - {message}"
+ )
+
+ logger.remove()
+ save_file = os.path.join(save_dir, filename)
+ if mode == "o" and os.path.exists(save_file):
+ os.remove(save_file)
+ # only keep logger in rank0 process
+ if distributed_rank == 0:
+ logger.add(
+ sys.stderr,
+ format=loguru_format,
+ level="INFO",
+ enqueue=True,
+ )
+ logger.add(save_file)
+
+ # redirect stdout/stderr to loguru
+ redirect_sys_output("INFO")
+
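+# Usage sketch (illustrative only; the output directory and file name are
+# assumptions for this example):
+#
+#   setup_logger("./YOLOX_outputs/exp", distributed_rank=0,
+#                filename="train_log.txt", mode="a")
+#   logger.info("stdout/stderr are now redirected to loguru and saved to disk")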
+
+class WandbLogger(object):
+ """
+ Log training runs, datasets, models, and predictions to Weights & Biases.
+ This logger sends information to W&B at wandb.ai.
+ By default, this information includes hyperparameters,
+ system configuration and metrics, model metrics,
+ and basic data metrics and analyses.
+
+ For more information, please refer to:
+ https://docs.wandb.ai/guides/track
+ https://docs.wandb.ai/guides/integrations/other/yolox
+ """
+ def __init__(self,
+ project=None,
+ name=None,
+ id=None,
+ entity=None,
+ save_dir=None,
+ config=None,
+ val_dataset=None,
+ num_eval_images=100,
+ log_checkpoints=False,
+ **kwargs):
+ """
+ Args:
+ project (str): wandb project name.
+ name (str): wandb run name.
+ id (str): wandb run id.
+ entity (str): wandb entity name.
+ save_dir (str): save directory.
+ config (dict): config dict.
+ val_dataset (Dataset): validation dataset.
+ num_eval_images (int): number of images from the validation set to log.
+ log_checkpoints (bool): log checkpoints
+ **kwargs: other kwargs.
+
+ Usage:
+ Any arguments for wandb.init can be provided on the command line using
+ the prefix `wandb-`.
+ Example
+ ```
+ python tools/train.py .... --logger wandb wandb-project \
+ wandb-name \
+ wandb-id \
+ wandb-save_dir \
+ wandb-num_eval_images \
+ wandb-log_checkpoints
+ ```
+ The val_dataset argument cannot be passed on the command line.
+ """
+ try:
+ import wandb
+ self.wandb = wandb
+ except ModuleNotFoundError:
+ raise ModuleNotFoundError(
+ "wandb is not installed."
+ "Please install wandb using pip install wandb"
+ )
+
+ from yolox.data.datasets import VOCDetection
+
+ self.project = project
+ self.name = name
+ self.id = id
+ self.save_dir = save_dir
+ self.config = config
+ self.kwargs = kwargs
+ self.entity = entity
+ self._run = None
+ self.val_artifact = None
+ if num_eval_images == -1:
+ self.num_log_images = len(val_dataset)
+ else:
+ self.num_log_images = min(num_eval_images, len(val_dataset))
+ self.log_checkpoints = (log_checkpoints == "True" or log_checkpoints == "true")
+ self._wandb_init = dict(
+ project=self.project,
+ name=self.name,
+ id=self.id,
+ entity=self.entity,
+ dir=self.save_dir,
+ resume="allow"
+ )
+ self._wandb_init.update(**kwargs)
+
+ _ = self.run
+
+ if self.config:
+ self.run.config.update(self.config)
+ self.run.define_metric("train/epoch")
+ self.run.define_metric("val/*", step_metric="train/epoch")
+ self.run.define_metric("train/step")
+ self.run.define_metric("train/*", step_metric="train/step")
+
+ self.voc_dataset = VOCDetection
+
+ if val_dataset and self.num_log_images != 0:
+ self.val_dataset = val_dataset
+ self.cats = val_dataset.cats
+ self.id_to_class = {
+ cls['id']: cls['name'] for cls in self.cats
+ }
+ self._log_validation_set(val_dataset)
+
+ @property
+ def run(self):
+ if self._run is None:
+ if self.wandb.run is not None:
+ logger.info(
+ "There is a wandb run already in progress "
+ "and newly created instances of `WandbLogger` will reuse"
+ " this run. If this is not desired, call `wandb.finish()`"
+ "before instantiating `WandbLogger`."
+ )
+ self._run = self.wandb.run
+ else:
+ self._run = self.wandb.init(**self._wandb_init)
+ return self._run
+
+ def _log_validation_set(self, val_dataset):
+ """
+ Log validation set to wandb.
+
+ Args:
+ val_dataset (Dataset): validation dataset.
+ """
+ if self.val_artifact is None:
+ self.val_artifact = self.wandb.Artifact(name="validation_images", type="dataset")
+ self.val_table = self.wandb.Table(columns=["id", "input"])
+
+ for i in range(self.num_log_images):
+ data_point = val_dataset[i]
+ img = data_point[0]
+ id = data_point[3]
+ img = np.transpose(img, (1, 2, 0))
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+ if isinstance(id, torch.Tensor):
+ id = id.item()
+
+ self.val_table.add_data(
+ id,
+ self.wandb.Image(img)
+ )
+
+ self.val_artifact.add(self.val_table, "validation_images_table")
+ self.run.use_artifact(self.val_artifact)
+ self.val_artifact.wait()
+
+ def _convert_prediction_format(self, predictions):
+ image_wise_data = defaultdict(int)
+
+ for key, val in predictions.items():
+ img_id = key
+
+ try:
+ bboxes, cls, scores = val
+ except KeyError:
+ bboxes, cls, scores = val["bboxes"], val["categories"], val["scores"]
+
+ # These store the actual bounding boxes, i.e. the ones that are not None
+ act_box = []
+ act_scores = []
+ act_cls = []
+
+ if bboxes is not None:
+ for box, classes, score in zip(bboxes, cls, scores):
+ if box is None or score is None or classes is None:
+ continue
+ act_box.append(box)
+ act_scores.append(score)
+ act_cls.append(classes)
+
+ image_wise_data.update({
+ int(img_id): {
+ "bboxes": [box.numpy().tolist() for box in act_box],
+ "scores": [score.numpy().item() for score in act_scores],
+ "categories": [
+ self.val_dataset.class_ids[int(act_cls[ind])]
+ for ind in range(len(act_box))
+ ],
+ }
+ })
+
+ return image_wise_data
+
+ def log_metrics(self, metrics, step=None):
+ """
+ Args:
+ metrics (dict): metrics dict.
+ step (int): step number.
+ """
+
+ for k, v in metrics.items():
+ if isinstance(v, torch.Tensor):
+ metrics[k] = v.item()
+
+ if step is not None:
+ metrics.update({"train/step": step})
+ self.run.log(metrics)
+
+ def log_images(self, predictions):
+ if len(predictions) == 0 or self.val_artifact is None or self.num_log_images == 0:
+ return
+
+ table_ref = self.val_artifact.get("validation_images_table")
+
+ columns = ["id", "predicted"]
+ for cls in self.cats:
+ columns.append(cls["name"])
+
+ if isinstance(self.val_dataset, self.voc_dataset):
+ predictions = self._convert_prediction_format(predictions)
+
+ result_table = self.wandb.Table(columns=columns)
+
+ for idx, val in table_ref.iterrows():
+
+ avg_scores = defaultdict(int)
+ num_occurrences = defaultdict(int)
+
+ id = val[0]
+ if isinstance(id, list):
+ id = id[0]
+
+ if id in predictions:
+ prediction = predictions[id]
+ boxes = []
+ for i in range(len(prediction["bboxes"])):
+ bbox = prediction["bboxes"][i]
+ x0 = bbox[0]
+ y0 = bbox[1]
+ x1 = bbox[2]
+ y1 = bbox[3]
+ box = {
+ "position": {
+ "minX": min(x0, x1),
+ "minY": min(y0, y1),
+ "maxX": max(x0, x1),
+ "maxY": max(y0, y1)
+ },
+ "class_id": prediction["categories"][i],
+ "domain": "pixel"
+ }
+ avg_scores[
+ self.id_to_class[prediction["categories"][i]]
+ ] += prediction["scores"][i]
+ num_occurrences[self.id_to_class[prediction["categories"][i]]] += 1
+ boxes.append(box)
+ else:
+ boxes = []
+ average_class_score = []
+ for cls in self.cats:
+ if cls["name"] not in num_occurrences:
+ score = 0
+ else:
+ score = avg_scores[cls["name"]] / num_occurrences[cls["name"]]
+ average_class_score.append(score)
+ result_table.add_data(
+ idx,
+ self.wandb.Image(val[1], boxes={
+ "prediction": {
+ "box_data": boxes,
+ "class_labels": self.id_to_class
+ }
+ }
+ ),
+ *average_class_score
+ )
+
+ self.wandb.log({"val_results/result_table": result_table})
+
+ def save_checkpoint(self, save_dir, model_name, is_best, metadata=None):
+ """
+ Args:
+ save_dir (str): save directory.
+ model_name (str): model name.
+ is_best (bool): whether the model is the best model.
+ metadata (dict): metadata to save corresponding to the checkpoint.
+ """
+
+ if not self.log_checkpoints:
+ return
+
+ if "epoch" in metadata:
+ epoch = metadata["epoch"]
+ else:
+ epoch = None
+
+ filename = os.path.join(save_dir, model_name + "_ckpt.pth")
+ artifact = self.wandb.Artifact(
+ name=f"run_{self.run.id}_model",
+ type="model",
+ metadata=metadata
+ )
+ artifact.add_file(filename, name="model_ckpt.pth")
+
+ aliases = ["latest"]
+
+ if is_best:
+ aliases.append("best")
+
+ if epoch:
+ aliases.append(f"epoch-{epoch}")
+
+ self.run.log_artifact(artifact, aliases=aliases)
+
+ def finish(self):
+ self.run.finish()
+
+ @classmethod
+ def initialize_wandb_logger(cls, args, exp, val_dataset):
+ wandb_params = dict()
+ prefix = "wandb-"
+ for k, v in zip(args.opts[0::2], args.opts[1::2]):
+ if k.startswith("wandb-"):
+ try:
+ wandb_params.update({k[len(prefix):]: int(v)})
+ except ValueError:
+ wandb_params.update({k[len(prefix):]: v})
+
+ return cls(config=vars(exp), val_dataset=val_dataset, **wandb_params)
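+
+
+# Usage sketch (illustrative only; `exp`, `val_dataset` and the metric values
+# are assumptions made for this example):
+#
+#   wandb_logger = WandbLogger(project="yolox", name="exp1",
+#                              config=vars(exp), val_dataset=val_dataset)
+#   wandb_logger.log_metrics({"train/loss": 2.31, "train/lr": 0.01}, step=100)
+#   wandb_logger.finish()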
diff --git a/what/models/detection/yolox/utils/lr_scheduler.py b/what/models/detection/yolox/utils/lr_scheduler.py
new file mode 100644
index 0000000..42c00cf
--- /dev/null
+++ b/what/models/detection/yolox/utils/lr_scheduler.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import math
+from functools import partial
+
+
+class LRScheduler:
+ def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs):
+ """
+ Supported lr schedulers: [cos, warmcos, yoloxwarmcos, yoloxsemiwarmcos, multistep]
+
+ Args:
+ lr (float): learning rate.
+ iters_per_epoch (int): number of iterations in one epoch.
+ total_epochs (int): number of epochs in training.
+ kwargs (dict):
+ - cos: None
+ - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)]
+ - multistep: [milestones (epochs), gamma (default 0.1)]
+ """
+
+ self.lr = lr
+ self.iters_per_epoch = iters_per_epoch
+ self.total_epochs = total_epochs
+ self.total_iters = iters_per_epoch * total_epochs
+
+ self.__dict__.update(kwargs)
+
+ self.lr_func = self._get_lr_func(name)
+
+ def update_lr(self, iters):
+ return self.lr_func(iters)
+
+ def _get_lr_func(self, name):
+ if name == "cos": # cosine lr schedule
+ lr_func = partial(cos_lr, self.lr, self.total_iters)
+ elif name == "warmcos":
+ warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
+ warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6)
+ lr_func = partial(
+ warm_cos_lr,
+ self.lr,
+ self.total_iters,
+ warmup_total_iters,
+ warmup_lr_start,
+ )
+ elif name == "yoloxwarmcos":
+ warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
+ no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
+ warmup_lr_start = getattr(self, "warmup_lr_start", 0)
+ min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
+ lr_func = partial(
+ yolox_warm_cos_lr,
+ self.lr,
+ min_lr_ratio,
+ self.total_iters,
+ warmup_total_iters,
+ warmup_lr_start,
+ no_aug_iters,
+ )
+ elif name == "yoloxsemiwarmcos":
+ warmup_lr_start = getattr(self, "warmup_lr_start", 0)
+ min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
+ warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
+ no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
+ normal_iters = self.iters_per_epoch * self.semi_epoch
+ semi_iters = self.iters_per_epoch_semi * (
+ self.total_epochs - self.semi_epoch - self.no_aug_epochs
+ )
+ lr_func = partial(
+ yolox_semi_warm_cos_lr,
+ self.lr,
+ min_lr_ratio,
+ warmup_lr_start,
+ self.total_iters,
+ normal_iters,
+ no_aug_iters,
+ warmup_total_iters,
+ semi_iters,
+ self.iters_per_epoch,
+ self.iters_per_epoch_semi,
+ )
+ elif name == "multistep": # stepwise lr schedule
+ milestones = [
+ int(self.total_iters * milestone / self.total_epochs)
+ for milestone in self.milestones
+ ]
+ gamma = getattr(self, "gamma", 0.1)
+ lr_func = partial(multistep_lr, self.lr, milestones, gamma)
+ else:
+ raise ValueError("Scheduler version {} not supported.".format(name))
+ return lr_func
+
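+# Usage sketch (illustrative hyper-parameters):
+#
+#   scheduler = LRScheduler("warmcos", lr=0.01, iters_per_epoch=100,
+#                           total_epochs=300, warmup_epochs=5)
+#   for it in range(scheduler.total_iters):
+#       lr = scheduler.update_lr(it)
+#       # ... assign `lr` to the optimizer's param groups ...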
+
+def cos_lr(lr, total_iters, iters):
+ """Cosine learning rate"""
+ lr *= 0.5 * (1.0 + math.cos(math.pi * iters / total_iters))
+ return lr
+
+
+def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters):
+ """Cosine learning rate with warm up."""
+ if iters <= warmup_total_iters:
+ lr = (lr - warmup_lr_start) * iters / float(
+ warmup_total_iters
+ ) + warmup_lr_start
+ else:
+ lr *= 0.5 * (
+ 1.0
+ + math.cos(
+ math.pi
+ * (iters - warmup_total_iters)
+ / (total_iters - warmup_total_iters)
+ )
+ )
+ return lr
+
+
+def yolox_warm_cos_lr(
+ lr,
+ min_lr_ratio,
+ total_iters,
+ warmup_total_iters,
+ warmup_lr_start,
+ no_aug_iter,
+ iters,
+):
+ """Cosine learning rate with warm up."""
+ min_lr = lr * min_lr_ratio
+ if iters <= warmup_total_iters:
+ # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
+ lr = (lr - warmup_lr_start) * pow(
+ iters / float(warmup_total_iters), 2
+ ) + warmup_lr_start
+ elif iters >= total_iters - no_aug_iter:
+ lr = min_lr
+ else:
+ lr = min_lr + 0.5 * (lr - min_lr) * (
+ 1.0
+ + math.cos(
+ math.pi
+ * (iters - warmup_total_iters)
+ / (total_iters - warmup_total_iters - no_aug_iter)
+ )
+ )
+ return lr
+
+
+def yolox_semi_warm_cos_lr(
+ lr,
+ min_lr_ratio,
+ warmup_lr_start,
+ total_iters,
+ normal_iters,
+ no_aug_iters,
+ warmup_total_iters,
+ semi_iters,
+ iters_per_epoch,
+ iters_per_epoch_semi,
+ iters,
+):
+ """Cosine learning rate with warm up."""
+ min_lr = lr * min_lr_ratio
+ if iters <= warmup_total_iters:
+ # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
+ lr = (lr - warmup_lr_start) * pow(
+ iters / float(warmup_total_iters), 2
+ ) + warmup_lr_start
+ elif iters >= normal_iters + semi_iters:
+ lr = min_lr
+ elif iters <= normal_iters:
+ lr = min_lr + 0.5 * (lr - min_lr) * (
+ 1.0
+ + math.cos(
+ math.pi
+ * (iters - warmup_total_iters)
+ / (total_iters - warmup_total_iters - no_aug_iters)
+ )
+ )
+ else:
+ lr = min_lr + 0.5 * (lr - min_lr) * (
+ 1.0
+ + math.cos(
+ math.pi
+ * (
+ normal_iters
+ - warmup_total_iters
+ + (iters - normal_iters)
+ * iters_per_epoch
+ * 1.0
+ / iters_per_epoch_semi
+ )
+ / (total_iters - warmup_total_iters - no_aug_iters)
+ )
+ )
+ return lr
+
+
+def multistep_lr(lr, milestones, gamma, iters):
+ """MultiStep learning rate"""
+ for milestone in milestones:
+ lr *= gamma if iters >= milestone else 1.0
+ return lr
diff --git a/what/models/detection/yolox/utils/metric.py b/what/models/detection/yolox/utils/metric.py
new file mode 100644
index 0000000..506b582
--- /dev/null
+++ b/what/models/detection/yolox/utils/metric.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+import functools
+import os
+import time
+from collections import defaultdict, deque
+import psutil
+
+import numpy as np
+
+import torch
+
+__all__ = [
+ "AverageMeter",
+ "MeterBuffer",
+ "get_total_and_free_memory_in_Mb",
+ "occupy_mem",
+ "gpu_mem_usage",
+ "mem_usage"
+]
+
+
+def get_total_and_free_memory_in_Mb(cuda_device):
+ devices_info_str = os.popen(
+ "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
+ )
+ devices_info = devices_info_str.read().strip().split("\n")
+ if "CUDA_VISIBLE_DEVICES" in os.environ:
+ visible_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
+ cuda_device = int(visible_devices[cuda_device])
+ total, used = devices_info[int(cuda_device)].split(",")
+ return int(total), int(used)
+
+
+def occupy_mem(cuda_device, mem_ratio=0.9):
+ """
+ Pre-allocate GPU memory for training to avoid memory fragmentation.
+ """
+ total, used = get_total_and_free_memory_in_Mb(cuda_device)
+ max_mem = int(total * mem_ratio)
+ block_mem = max_mem - used
+ x = torch.cuda.FloatTensor(256, 1024, block_mem)
+ del x
+ time.sleep(5)
+
+
+def gpu_mem_usage():
+ """
+ Compute the GPU memory usage for the current device (MB).
+ """
+ mem_usage_bytes = torch.cuda.max_memory_allocated()
+ return mem_usage_bytes / (1024 * 1024)
+
+
+def mem_usage():
+ """
+ Compute the memory usage for the current machine (GB).
+ """
+ gb = 1 << 30
+ mem = psutil.virtual_memory()
+ return mem.used / gb
+
+
+class AverageMeter:
+ """Track a series of values and provide access to smoothed values over a
+ window or the global series average.
+ """
+
+ def __init__(self, window_size=50):
+ self._deque = deque(maxlen=window_size)
+ self._total = 0.0
+ self._count = 0
+
+ def update(self, value):
+ self._deque.append(value)
+ self._count += 1
+ self._total += value
+
+ @property
+ def median(self):
+ d = np.array(list(self._deque))
+ return np.median(d)
+
+ @property
+ def avg(self):
+ # if deque is empty, nan will be returned.
+ d = np.array(list(self._deque))
+ return d.mean()
+
+ @property
+ def global_avg(self):
+ return self._total / max(self._count, 1e-5)
+
+ @property
+ def latest(self):
+ return self._deque[-1] if len(self._deque) > 0 else None
+
+ @property
+ def total(self):
+ return self._total
+
+ def reset(self):
+ self._deque.clear()
+ self._total = 0.0
+ self._count = 0
+
+ def clear(self):
+ self._deque.clear()
+
+
+class MeterBuffer(defaultdict):
+ """Computes and stores the average and current value"""
+
+ def __init__(self, window_size=20):
+ factory = functools.partial(AverageMeter, window_size=window_size)
+ super().__init__(factory)
+
+ def reset(self):
+ for v in self.values():
+ v.reset()
+
+ def get_filtered_meter(self, filter_key="time"):
+ return {k: v for k, v in self.items() if filter_key in k}
+
+ def update(self, values=None, **kwargs):
+ if values is None:
+ values = {}
+ values.update(kwargs)
+ for k, v in values.items():
+ if isinstance(v, torch.Tensor):
+ v = v.detach()
+ self[k].update(v)
+
+ def clear_meters(self):
+ for v in self.values():
+ v.clear()
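+
+
+# Usage sketch (illustrative values):
+#
+#   meters = MeterBuffer(window_size=20)
+#   meters.update(iter_time=0.12, loss=2.3)
+#   meters.update(iter_time=0.10, loss=2.1)
+#   print(meters["loss"].avg, meters["iter_time"].global_avg)
+#   time_meters = meters.get_filtered_meter("time")   # only the *time* meters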
diff --git a/what/models/detection/yolox/utils/model_utils.py b/what/models/detection/yolox/utils/model_utils.py
new file mode 100644
index 0000000..3bc2d1f
--- /dev/null
+++ b/what/models/detection/yolox/utils/model_utils.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import contextlib
+from copy import deepcopy
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+
+__all__ = [
+ "fuse_conv_and_bn",
+ "fuse_model",
+ "get_model_info",
+ "replace_module",
+ "freeze_module",
+ "adjust_status",
+]
+
+
+def get_model_info(model: nn.Module, tsize: Sequence[int]) -> str:
+ from thop import profile
+
+ stride = 64
+ img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device)
+ flops, params = profile(deepcopy(model), inputs=(img,), verbose=False)
+ params /= 1e6
+ flops /= 1e9
+ flops *= tsize[0] * tsize[1] / stride / stride * 2 # Gflops
+ info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops)
+ return info
+
+
+def fuse_conv_and_bn(conv: nn.Conv2d, bn: nn.BatchNorm2d) -> nn.Conv2d:
+ """
+ Fuse convolution and batchnorm layers.
+ check more info on https://tehnokv.com/posts/fusing-batchnorm-and-conv/
+
+ Args:
+ conv (nn.Conv2d): convolution to fuse.
+ bn (nn.BatchNorm2d): batchnorm to fuse.
+
+ Returns:
+ nn.Conv2d: fused convolution behaves the same as the input conv and bn.
+ """
+ fusedconv = (
+ nn.Conv2d(
+ conv.in_channels,
+ conv.out_channels,
+ kernel_size=conv.kernel_size,
+ stride=conv.stride,
+ padding=conv.padding,
+ groups=conv.groups,
+ bias=True,
+ )
+ .requires_grad_(False)
+ .to(conv.weight.device)
+ )
+
+ # prepare filters
+ w_conv = conv.weight.clone().view(conv.out_channels, -1)
+ w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
+ fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
+
+ # prepare spatial bias
+ b_conv = (
+ torch.zeros(conv.weight.size(0), device=conv.weight.device)
+ if conv.bias is None
+ else conv.bias
+ )
+ b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(
+ torch.sqrt(bn.running_var + bn.eps)
+ )
+ fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
+
+ return fusedconv
+
+
+def fuse_model(model: nn.Module) -> nn.Module:
+ """fuse conv and bn in model
+
+ Args:
+ model (nn.Module): model to fuse
+
+ Returns:
+ nn.Module: fused model
+ """
+ from yolox.models.network_blocks import BaseConv
+
+ for m in model.modules():
+ if type(m) is BaseConv and hasattr(m, "bn"):
+ m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
+ delattr(m, "bn") # remove batchnorm
+ m.forward = m.fuseforward # update forward
+ return model
+
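+# Usage sketch (illustrative only; `exp` and `ckpt` are assumed to come from
+# the surrounding inference script). Fusing is typically done once, after the
+# weights are loaded and the model is switched to eval mode:
+#
+#   model = exp.get_model()
+#   model.load_state_dict(ckpt["model"])
+#   model.eval()
+#   model = fuse_model(model)   # conv + bn pairs are folded into single convs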
+
+def replace_module(module, replaced_module_type, new_module_type, replace_func=None) -> nn.Module:
+ """
+ Replace a given module type in `module` with a new type; mostly used for deployment.
+
+ Args:
+ module (nn.Module): model to apply replace operation.
+ replaced_module_type (Type): module type to be replaced.
+ new_module_type (Type): module type used as the replacement.
+ replace_func (function): python function that describes the replace logic. Default value: None.
+
+ Returns:
+ model (nn.Module): module that already been replaced.
+ """
+
+ def default_replace_func(replaced_module_type, new_module_type):
+ return new_module_type()
+
+ if replace_func is None:
+ replace_func = default_replace_func
+
+ model = module
+ if isinstance(module, replaced_module_type):
+ model = replace_func(replaced_module_type, new_module_type)
+ else: # recursively replace
+ for name, child in module.named_children():
+ new_child = replace_module(child, replaced_module_type, new_module_type, replace_func)
+ if new_child is not child: # child is already replaced
+ model.add_module(name, new_child)
+
+ return model
+
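+# Usage sketch (illustrative only): swap every SiLU for a plain ReLU, e.g.
+# when exporting to a runtime that does not support SiLU:
+#
+#   model = replace_module(model, nn.SiLU, nn.ReLU)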
+
+def freeze_module(module: nn.Module, name=None) -> nn.Module:
+ """freeze module inplace
+
+ Args:
+ module (nn.Module): module to freeze.
+ name (str, optional): name to freeze. If not given, freeze the whole module.
+ Note that fuzzy match is not supported. Defaults to None.
+
+ Examples:
+ freeze the backbone of a model
+ >>> freeze_module(model.backbone)
+
+ or freeze the backbone of a model by name
+ >>> freeze_module(model, name="backbone")
+ """
+ for param_name, parameter in module.named_parameters():
+ if name is None or name in param_name:
+ parameter.requires_grad = False
+
+ # ensure modules like BN and dropout are frozen
+ for module_name, sub_module in module.named_modules():
+ # strictly speaking, there is no need to call eval() on every single sub_module
+ if name is None or name in module_name:
+ sub_module.eval()
+
+ return module
+
+
+@contextlib.contextmanager
+def adjust_status(module: nn.Module, training: bool = False) -> nn.Module:
+ """Adjust module to training/eval mode temporarily.
+
+ Args:
+ module (nn.Module): module to adjust status.
+ training (bool): training mode to set. True for train mode, False for eval mode.
+
+ Examples:
+ >>> with adjust_status(model, training=False):
+ ... model(data)
+ """
+ status = {}
+
+ def backup_status(module):
+ for m in module.modules():
+ # save prev status to dict
+ status[m] = m.training
+ m.training = training
+
+ def recover_status(module):
+ for m in module.modules():
+ # recover prev status from dict
+ m.training = status.pop(m)
+
+ backup_status(module)
+ yield module
+ recover_status(module)
diff --git a/what/models/detection/yolox/utils/setup_env.py b/what/models/detection/yolox/utils/setup_env.py
new file mode 100644
index 0000000..45289f3
--- /dev/null
+++ b/what/models/detection/yolox/utils/setup_env.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import os
+import subprocess
+from loguru import logger
+
+import cv2
+
+from .dist import get_world_size, is_main_process
+
+__all__ = ["configure_nccl", "configure_module", "configure_omp"]
+
+
+def configure_nccl():
+ """Configure multi-machine environment variables of NCCL."""
+ os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL"
+ os.environ["NCCL_IB_HCA"] = subprocess.getoutput(
+ "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; "
+ "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null "
+ "| grep v >/dev/null && echo $i ; done; popd > /dev/null"
+ )
+ os.environ["NCCL_IB_GID_INDEX"] = "3"
+ os.environ["NCCL_IB_TC"] = "106"
+
+
+def configure_omp(num_threads=1):
+ """
+ If OMP_NUM_THREADS is not configured and world_size is greater than 1,
+ set the `OMP_NUM_THREADS` environment variable to `num_threads`.
+
+ Args:
+ num_threads (int): value of `OMP_NUM_THREADS` to set.
+ """
+ # We set OMP_NUM_THREADS=1 by default, which achieves the best speed on our machines
+ # feel free to change it for better performance.
+ if "OMP_NUM_THREADS" not in os.environ and get_world_size() > 1:
+ os.environ["OMP_NUM_THREADS"] = str(num_threads)
+ if is_main_process():
+ logger.info(
+ "\n***************************************************************\n"
+ "We set `OMP_NUM_THREADS` for each process to {} to speed up.\n"
+ "please further tune the variable for optimal performance.\n"
+ "***************************************************************".format(
+ os.environ["OMP_NUM_THREADS"]
+ )
+ )
+
+
+def configure_module(ulimit_value=8192):
+ """
+ Configure the PyTorch runtime environment. Settings for ulimit and cv2 are applied.
+
+ Args:
+ ulimit_value(int): default open file number on linux. Default value: 8192.
+ """
+ # system setting
+ try:
+ import resource
+
+ rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+ resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1]))
+ except Exception:
+ # An exception might be raised on Windows, or when rlimit already reaches its max value.
+ # However, setting the rlimit value might not be necessary.
+ pass
+
+ # cv2
+ # multiprocessing may hurt the performance of the torch dataloader
+ os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
+ try:
+ cv2.setNumThreads(0)
+ cv2.ocl.setUseOpenCL(False)
+ except Exception:
+ # a cv2 version mismatch might raise exceptions.
+ pass
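+
+
+# Usage sketch (illustrative only): these helpers are typically called once at
+# the top of a training entry point, before dataloaders or process groups are
+# created; configure_nccl() is only relevant for multi-machine runs.
+#
+#   configure_module()
+#   configure_omp()
+#   configure_nccl()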
diff --git a/what/models/detection/yolox/utils/visualize.py b/what/models/detection/yolox/utils/visualize.py
new file mode 100644
index 0000000..e714a3e
--- /dev/null
+++ b/what/models/detection/yolox/utils/visualize.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) Megvii Inc. All rights reserved.
+
+import cv2
+import numpy as np
+
+__all__ = ["vis"]
+
+
+def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
+
+ for i in range(len(boxes)):
+ box = boxes[i]
+ cls_id = int(cls_ids[i])
+ score = scores[i]
+ if score < conf:
+ continue
+ x0 = int(box[0])
+ y0 = int(box[1])
+ x1 = int(box[2])
+ y1 = int(box[3])
+
+ color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
+ text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
+ txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
+ font = cv2.FONT_HERSHEY_SIMPLEX
+
+ txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
+ cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
+
+ txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist()
+ cv2.rectangle(
+ img,
+ (x0, y0 + 1),
+ (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])),
+ txt_bk_color,
+ -1
+ )
+ cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)
+
+ return img
+
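+# Usage sketch (illustrative only; `img`, `boxes`, `scores` and `cls_ids` are
+# assumed to come from a decoded YOLOX prediction, with boxes in absolute
+# pixel (x1, y1, x2, y2) coordinates, and COCO_CLASSES is assumed imported):
+#
+#   drawn = vis(img, boxes, scores, cls_ids, conf=0.35, class_names=COCO_CLASSES)
+#   cv2.imwrite("prediction.jpg", drawn)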
+
+_COLORS = np.array(
+ [
+ 0.000, 0.447, 0.741,
+ 0.850, 0.325, 0.098,
+ 0.929, 0.694, 0.125,
+ 0.494, 0.184, 0.556,
+ 0.466, 0.674, 0.188,
+ 0.301, 0.745, 0.933,
+ 0.635, 0.078, 0.184,
+ 0.300, 0.300, 0.300,
+ 0.600, 0.600, 0.600,
+ 1.000, 0.000, 0.000,
+ 1.000, 0.500, 0.000,
+ 0.749, 0.749, 0.000,
+ 0.000, 1.000, 0.000,
+ 0.000, 0.000, 1.000,
+ 0.667, 0.000, 1.000,
+ 0.333, 0.333, 0.000,
+ 0.333, 0.667, 0.000,
+ 0.333, 1.000, 0.000,
+ 0.667, 0.333, 0.000,
+ 0.667, 0.667, 0.000,
+ 0.667, 1.000, 0.000,
+ 1.000, 0.333, 0.000,
+ 1.000, 0.667, 0.000,
+ 1.000, 1.000, 0.000,
+ 0.000, 0.333, 0.500,
+ 0.000, 0.667, 0.500,
+ 0.000, 1.000, 0.500,
+ 0.333, 0.000, 0.500,
+ 0.333, 0.333, 0.500,
+ 0.333, 0.667, 0.500,
+ 0.333, 1.000, 0.500,
+ 0.667, 0.000, 0.500,
+ 0.667, 0.333, 0.500,
+ 0.667, 0.667, 0.500,
+ 0.667, 1.000, 0.500,
+ 1.000, 0.000, 0.500,
+ 1.000, 0.333, 0.500,
+ 1.000, 0.667, 0.500,
+ 1.000, 1.000, 0.500,
+ 0.000, 0.333, 1.000,
+ 0.000, 0.667, 1.000,
+ 0.000, 1.000, 1.000,
+ 0.333, 0.000, 1.000,
+ 0.333, 0.333, 1.000,
+ 0.333, 0.667, 1.000,
+ 0.333, 1.000, 1.000,
+ 0.667, 0.000, 1.000,
+ 0.667, 0.333, 1.000,
+ 0.667, 0.667, 1.000,
+ 0.667, 1.000, 1.000,
+ 1.000, 0.000, 1.000,
+ 1.000, 0.333, 1.000,
+ 1.000, 0.667, 1.000,
+ 0.333, 0.000, 0.000,
+ 0.500, 0.000, 0.000,
+ 0.667, 0.000, 0.000,
+ 0.833, 0.000, 0.000,
+ 1.000, 0.000, 0.000,
+ 0.000, 0.167, 0.000,
+ 0.000, 0.333, 0.000,
+ 0.000, 0.500, 0.000,
+ 0.000, 0.667, 0.000,
+ 0.000, 0.833, 0.000,
+ 0.000, 1.000, 0.000,
+ 0.000, 0.000, 0.167,
+ 0.000, 0.000, 0.333,
+ 0.000, 0.000, 0.500,
+ 0.000, 0.000, 0.667,
+ 0.000, 0.000, 0.833,
+ 0.000, 0.000, 1.000,
+ 0.000, 0.000, 0.000,
+ 0.143, 0.143, 0.143,
+ 0.286, 0.286, 0.286,
+ 0.429, 0.429, 0.429,
+ 0.571, 0.571, 0.571,
+ 0.714, 0.714, 0.714,
+ 0.857, 0.857, 0.857,
+ 0.000, 0.447, 0.741,
+ 0.314, 0.717, 0.741,
+ 0.50, 0.5, 0
+ ]
+).astype(np.float32).reshape(-1, 3)
diff --git a/what/models/detection/yolox/yolox_l.py b/what/models/detection/yolox/yolox_l.py
new file mode 100644
index 0000000..e2140d0
--- /dev/null
+++ b/what/models/detection/yolox/yolox_l.py
@@ -0,0 +1,59 @@
+import cv2
+import torch
+import numpy as np
+
+
+from .data.datasets import COCO_CLASSES
+from .exp import get_exp
+from .predictor import Predictor
+
+class YOLOX_L:
+ def __init__(self, class_names, model_path):
+ exp = get_exp(None, "yolox-l")
+ self.model = exp.get_model()
+
+ # load the model state dict
+ device = 'gpu' if torch.cuda.is_available() else 'cpu'
+ ckpt = torch.load(model_path, map_location="cpu")
+ self.model.load_state_dict(ckpt["model"])
+
+ self.predictor = Predictor(
+ self.model, exp, COCO_CLASSES, None, None,
+ device, False, False,
+ )
+
+ self.class_names = class_names
+
+ def predict(self, image):
+ outputs, _ = self.predictor.inference(image)
+ height, width, _ = image.shape
+ scale = min(640 / float(height), 640 / float(width))
+
+ # Post-process the raw predictions
+ class_ids = []
+ boxes = []
+ probs = []
+
+ if outputs[0] is not None:
+ boxes = outputs[0][:, 0:4].cpu().numpy()
+ class_ids = outputs[0][:, -1].cpu().numpy().astype(np.uint32)
+ probs = (outputs[0][:, 4] * outputs[0][:, 5]).cpu().numpy()
+
+ boxes = np.array([box for box, prob in zip(boxes, probs) if prob >= 0.5])
+ class_ids = class_ids[probs >= 0.5]
+ probs = probs[probs >= 0.5]
+
+ # From absolute (x1, y1, x2, y2) --> normalized (cx, cy, w, h)
+ for box in boxes:
+ box /= scale
+ box[0] /= width
+ box[1] /= height
+ box[2] /= width
+ box[3] /= height
+
+ box[2] -= box[0]
+ box[3] -= box[1]
+ box[0] += (box[2] / 2)
+ box[1] += (box[3] / 2)
+
+ return image, boxes, class_ids, probs
diff --git a/what/models/detection/yolox/yolox_m.py b/what/models/detection/yolox/yolox_m.py
new file mode 100644
index 0000000..c18b21d
--- /dev/null
+++ b/what/models/detection/yolox/yolox_m.py
@@ -0,0 +1,59 @@
+import cv2
+import torch
+import numpy as np
+
+
+from .data.datasets import COCO_CLASSES
+from .exp import get_exp
+from .predictor import Predictor
+
+class YOLOX_M:
+ def __init__(self, class_names, model_path):
+ exp = get_exp(None, "yolox-m")
+ self.model = exp.get_model()
+
+ # load the model state dict
+ device = 'gpu' if torch.cuda.is_available() else 'cpu'
+ ckpt = torch.load(model_path, map_location="cpu")
+ self.model.load_state_dict(ckpt["model"])
+
+ self.predictor = Predictor(
+ self.model, exp, COCO_CLASSES, None, None,
+ device, False, False,
+ )
+
+ self.class_names = class_names
+
+ def predict(self, image):
+ outputs, _ = self.predictor.inference(image)
+ height, width, _ = image.shape
+ scale = min(640 / float(height), 640 / float(width))
+
+ # Post-process the raw predictions
+ class_ids = []
+ boxes = []
+ probs = []
+
+ if outputs[0] is not None:
+ boxes = outputs[0][:, 0:4].cpu().numpy()
+ class_ids = outputs[0][:, -1].cpu().numpy().astype(np.uint32)
+ probs = (outputs[0][:, 4] * outputs[0][:, 5]).cpu().numpy()
+
+ boxes = np.array([box for box, prob in zip(boxes, probs) if prob >= 0.5])
+ class_ids = class_ids[probs >= 0.5]
+ probs = probs[probs >= 0.5]
+
+ # From absolute (x1, y1, x2, y2) --> normalized (cx, cy, w, h)
+ for box in boxes:
+ box /= scale
+ box[0] /= width
+ box[1] /= height
+ box[2] /= width
+ box[3] /= height
+
+ box[2] -= box[0]
+ box[3] -= box[1]
+ box[0] += (box[2] / 2)
+ box[1] += (box[3] / 2)
+
+ return image, boxes, class_ids, probs
diff --git a/what/models/detection/yolox/yolox_s.py b/what/models/detection/yolox/yolox_s.py
new file mode 100644
index 0000000..45b43b5
--- /dev/null
+++ b/what/models/detection/yolox/yolox_s.py
@@ -0,0 +1,59 @@
+import cv2
+import torch
+import numpy as np
+
+
+from .data.datasets import COCO_CLASSES
+from .exp import get_exp
+from .predictor import Predictor
+
+class YOLOX_S:
+ def __init__(self, class_names, model_path):
+ exp = get_exp(None, "yolox-s")
+ self.model = exp.get_model()
+
+ # load the model state dict
+ device = 'gpu' if torch.cuda.is_available() else 'cpu'
+ ckpt = torch.load(model_path, map_location="cpu")
+ self.model.load_state_dict(ckpt["model"])
+
+ self.predictor = Predictor(
+ self.model, exp, COCO_CLASSES, None, None,
+ device, False, False,
+ )
+
+ self.class_names = class_names
+
+ def predict(self, image):
+ outputs, _ = self.predictor.inference(image)
+ height, width, _ = image.shape
+ scale = min(640 / float(height), 640 / float(width))
+
+ # Post-process the raw predictions
+ class_ids = []
+ boxes = []
+ probs = []
+
+ if outputs[0] is not None:
+ boxes = outputs[0][:, 0:4].cpu().numpy()
+ class_ids = outputs[0][:, -1].cpu().numpy().astype(np.uint32)
+ probs = (outputs[0][:, 4] * outputs[0][:, 5]).cpu().numpy()
+
+ boxes = np.array([box for box, prob in zip(boxes, probs) if prob >= 0.5])
+ class_ids = class_ids[probs >= 0.5]
+ probs = probs[probs >= 0.5]
+
+ # From absolute (x1, y1, x2, y2) --> normalized (cx, cy, w, h)
+ for box in boxes:
+ box /= scale
+ box[0] /= width
+ box[1] /= height
+ box[2] /= width
+ box[3] /= height
+
+ box[2] -= box[0]
+ box[3] -= box[1]
+ box[0] += (box[2] / 2)
+ box[1] += (box[3] / 2)
+
+ return image, boxes, class_ids, probs
diff --git a/what/models/detection/yolox/yolox_x.py b/what/models/detection/yolox/yolox_x.py
new file mode 100644
index 0000000..c286c67
--- /dev/null
+++ b/what/models/detection/yolox/yolox_x.py
@@ -0,0 +1,59 @@
+import cv2
+import torch
+import numpy as np
+
+
+from .data.datasets import COCO_CLASSES
+from .exp import get_exp
+from .predictor import Predictor
+
+class YOLOX_X:
+ def __init__(self, class_names, model_path):
+ exp = get_exp(None, "yolox-x")
+ self.model = exp.get_model()
+
+ # load the model state dict
+ device = 'gpu' if torch.cuda.is_available() else 'cpu'
+ ckpt = torch.load(model_path, map_location="cpu")
+ self.model.load_state_dict(ckpt["model"])
+
+ self.predictor = Predictor(
+ self.model, exp, COCO_CLASSES, None, None,
+ device, False, False,
+ )
+
+ self.class_names = class_names
+
+ def predict(self, image):
+ outputs, _ = self.predictor.inference(image)
+ height, width, _ = image.shape
+ scale = min(640 / float(height), 640 / float(width))
+
+ # Post-process the raw predictions
+ class_ids = []
+ boxes = []
+ probs = []
+
+ if outputs[0] is not None:
+ boxes = outputs[0][:, 0:4].cpu().numpy()
+ class_ids = outputs[0][:, -1].cpu().numpy().astype(np.uint32)
+ probs = (outputs[0][:, 4] * outputs[0][:, 5]).cpu().numpy()
+
+ boxes = np.array([box for box, prob in zip(boxes, probs) if prob >= 0.5])
+ class_ids = class_ids[probs >= 0.5]
+ probs = probs[probs >= 0.5]
+
+ # From absolute (x1, y1, x2, y2) --> normalized (cx, cy, w, h)
+ for box in boxes:
+ box /= scale
+ box[0] /= width
+ box[1] /= height
+ box[2] /= width
+ box[3] /= height
+
+ box[2] -= box[0]
+ box[3] -= box[1]
+ box[0] += (box[2] / 2)
+ box[1] += (box[3] / 2)
+
+ return image, boxes, class_ids, probs