diff --git a/README.md b/README.md index 92d8ee93..47d1deab 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ English | [简体中文](README_cn.md) Open In Kaggle
- +cp ## YOLOv6 Implementation of paper: diff --git a/configs/base/README.md b/configs/base/README.md deleted file mode 100644 index 77ef5a4e..00000000 --- a/configs/base/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## YOLOv6 base model - -English | [简体中文](./README_cn.md) - -### Features - -- Use only regular convolution and ReLU activation functions. - -- Apply CSP (1/2 channel dim) blocks in the network structure, except for Nano base model. - -Advantages: -- Adopt a unified network structure and configuration, and the accuracy loss of the PTQ 8-bit quantization model is negligible. -- Suitable for users who are just getting started or who need to apply, optimize and deploy an 8-bit quantization model quickly and frequently. - - -### Performance - -| Model | Size | mAPval
0.5:0.95 | SpeedT4
TRT FP16 b1
(FPS) | SpeedT4
TRT FP16 b32
(FPS) | SpeedT4
TRT INT8 b1
(FPS) | SpeedT4
TRT INT8 b32
(FPS) | Params
(M) | FLOPs
(G) | -| :--------------------------------------------------------------------------------------------- | --- | ----------------- | ----- | ---- | ---- | ---- | ----- | ------ | -| [**YOLOv6-N-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n_base.pt) | 640 | 36.6distill | 727 | 1302 | 814 | 1805 | 4.65 | 11.46 | -| [**YOLOv6-S-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s_base.pt) | 640 | 45.3distill | 346 | 525 | 487 | 908 | 13.14 | 30.6 | -| [**YOLOv6-M-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m_base.pt) | 640 | 49.4distill | 179 | 245 | 284 | 439 | 28.33 | 72.30 | -| [**YOLOv6-L-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6l_base.pt) | 640 | 51.1distill | 116 | 157 | 196 | 288 | 59.61 | 150.89 | - -- Speed is tested with TensorRT 8.2.4.2 on T4. -- The processes of model training, evaluation, and inference are the same as the original ones. For details, please refer to [this README](https://github.com/meituan/YOLOv6#quick-start). diff --git a/configs/base/README_cn.md b/configs/base/README_cn.md deleted file mode 100644 index b6b01d14..00000000 --- a/configs/base/README_cn.md +++ /dev/null @@ -1,25 +0,0 @@ -## YOLOv6 基础版模型 - -简体中文 | [English](./README.md) - -### 模型特点 - -- 仅使用常规卷积和Relu激活函数 - -- 网络结构均采用CSP (1/2通道) block,Nano网络除外。 - -优势: -- 采用统一的网络结构和配置,且 PTQ 8位量化模型精度损失较小,适合刚入门或有快速迭代部署8位量化模型需求的用户。 - - -### 模型指标 - -| 模型 | 尺寸 | mAPval
0.5:0.95 | 速度T4
TRT FP16 b1
(FPS) | 速度T4
TRT FP16 b32
(FPS) | 速度T4
TRT INT8 b1
(FPS) | 速度T4
TRT INT8 b32
(FPS) | Params
(M) | FLOPs
(G) | -| :--------------------------------------------------------------------------------------------- | --- | ----------------- | ----- | ---- | ---- | ---- | ----- | ------ | -| [**YOLOv6-N-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n_base.pt) | 640 | 36.6distill | 727 | 1302 | 814 | 1805 | 4.65 | 11.46 | -| [**YOLOv6-S-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s_base.pt) | 640 | 45.3distill | 346 | 525 | 487 | 908 | 13.14 | 30.6 | -| [**YOLOv6-M-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m_base.pt) | 640 | 49.4distill | 179 | 245 | 284 | 439 | 28.33 | 72.30 | -| [**YOLOv6-L-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6l_base.pt) | 640 | 51.1distill | 116 | 157 | 196 | 288 | 59.61 | 150.89 | - -- 速度是在 T4 上测试的,TensorRT 版本为 8.4.2.4; -- 模型训练、评估、推理流程与原来保持一致,具体可参考 [首页 README 文档](https://github.com/meituan/YOLOv6/blob/main/README_cn.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B)。 diff --git a/configs/base/yolov6l_base_finetune.py b/configs/base/yolov6l_base_finetune.py deleted file mode 100644 index 7e8dc062..00000000 --- a/configs/base/yolov6l_base_finetune.py +++ /dev/null @@ -1,63 +0,0 @@ -# YOLOv6 large base model -model = dict( - type='YOLOv6l_base', - depth_multiple=1.0, - width_multiple=1.0, - pretrained=None, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6m_base_finetune.py b/configs/base/yolov6m_base_finetune.py deleted file mode 100644 index af5449ec..00000000 --- a/configs/base/yolov6m_base_finetune.py +++ /dev/null @@ -1,67 +0,0 @@ -# YOLOv6m medium/large base model -model = dict( - type='YOLOv6m_base', - pretrained=None, - depth_multiple=0.80, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 0.8, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, 
- lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6n_base.py b/configs/base/yolov6n_base.py deleted file mode 100644 index 8340ca60..00000000 --- a/configs/base/yolov6n_base.py +++ /dev/null @@ -1,66 +0,0 @@ -# YOLOv6s nano base model -model = dict( - type='YOLOv6n_base', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, # set to True if you want to further train with distillation - reg_max=16, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6n_base_finetune.py b/configs/base/yolov6n_base_finetune.py deleted file mode 100644 index 593c3def..00000000 --- a/configs/base/yolov6n_base_finetune.py +++ /dev/null @@ -1,66 +0,0 @@ -# YOLOv6s nanao base model -model = dict( - type='YOLOv6n_base', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6s_base.py b/configs/base/yolov6s_base.py deleted file mode 100644 index 4e28c178..00000000 --- 
a/configs/base/yolov6s_base.py +++ /dev/null @@ -1,68 +0,0 @@ -# YOLOv6s small base model -model = dict( - type='YOLOv6s_base', - pretrained=None, - depth_multiple=0.70, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='CSPRepBiFPANNeck',#CSPRepPANNeck - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, # set to True if you want to further train with distillation - reg_max=16, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6s_base_finetune.py b/configs/base/yolov6s_base_finetune.py deleted file mode 100644 index eb4d2159..00000000 --- a/configs/base/yolov6s_base_finetune.py +++ /dev/null @@ -1,68 +0,0 @@ -# YOLOv6s small base model -model = dict( - type='YOLOv6s_base', - pretrained=None, - depth_multiple=0.70, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/experiment/eval_640_repro.py b/configs/experiment/eval_640_repro.py deleted file mode 100644 index 1f6a6217..00000000 --- a/configs/experiment/eval_640_repro.py +++ /dev/null @@ -1,79 +0,0 @@ -# eval param for different scale - -eval_params = dict( - default = dict( - img_size=640, - shrink_size=2, - infer_on_rect=False, - ), - yolov6n = dict( - img_size=640, - shrink_size=4, - infer_on_rect=False, - ), - yolov6t = dict( - img_size=640, - shrink_size=6, - infer_on_rect=False, - ), - yolov6s = 
dict( - img_size=640, - shrink_size=6, - infer_on_rect=False, - ), - yolov6m = dict( - img_size=640, - shrink_size=4, - infer_on_rect=False, - ), - yolov6l = dict( - img_size=640, - shrink_size=4, - infer_on_rect=False, - ), - yolov6l_relu = dict( - img_size=640, - shrink_size=2, - infer_on_rect=False, - ), - yolov6n6 = dict( - img_size=1280, - shrink_size=17, - infer_on_rect=False, - ), - yolov6s6 = dict( - img_size=1280, - shrink_size=8, - infer_on_rect=False, - ), - yolov6m6 = dict( - img_size=1280, - shrink_size=64, - infer_on_rect=False, - ), - yolov6l6 = dict( - img_size=1280, - shrink_size=41, - infer_on_rect=False, - ), - yolov6s_mbla = dict( - img_size=640, - shrink_size=7, - infer_on_rect=False, - ), - yolov6m_mbla = dict( - img_size=640, - shrink_size=7, - infer_on_rect=False, - ), - yolov6l_mbla = dict( - img_size=640, - shrink_size=7, - infer_on_rect=False, - ), - yolov6x_mbla = dict( - img_size=640, - shrink_size=3, - infer_on_rect=False, - ) -) diff --git a/configs/experiment/yolov6n_with_eval_params.py b/configs/experiment/yolov6n_with_eval_params.py deleted file mode 100644 index e7366b33..00000000 --- a/configs/experiment/yolov6n_with_eval_params.py +++ /dev/null @@ -1,76 +0,0 @@ -# YOLOv6n model with eval param(when traing) -model = dict( - type='YOLOv6n', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, #0.01 # 0.02 - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Eval params when eval model. -# If eval_params item is list, eg conf_thres=[0.03, 0.03], -# first will be used in train.py and second will be used in eval.py. -eval_params = dict( - batch_size=None, #None mean will be the same as batch on one device * 2 - img_size=None, #None mean will be the same as train image size - conf_thres=0.03, - iou_thres=0.65, - - #pading and scale coord - shrink_size=None, # None mean will not shrink the image. 
- infer_on_rect=True, - - #metric - verbose=False, - do_coco_metric=True, - do_pr_metric=False, - plot_curve=False, - plot_confusion_matrix=False -) diff --git a/configs/experiment/yolov6s_csp_scaled.py b/configs/experiment/yolov6s_csp_scaled.py deleted file mode 100644 index ba28843a..00000000 --- a/configs/experiment/yolov6s_csp_scaled.py +++ /dev/null @@ -1,57 +0,0 @@ -# YOLOv6m model -model = dict( - type='YOLOv6s_csp', - pretrained=None, - depth_multiple=0.70, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - neck=dict( - type='CSPRepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) diff --git a/configs/experiment/yolov6t.py b/configs/experiment/yolov6t.py deleted file mode 100644 index afacd436..00000000 --- a/configs/experiment/yolov6t.py +++ /dev/null @@ -1,55 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/experiment/yolov6t_csp_scaled.py b/configs/experiment/yolov6t_csp_scaled.py deleted file mode 100644 index e8ba99a9..00000000 --- a/configs/experiment/yolov6t_csp_scaled.py +++ /dev/null @@ -1,57 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n_csp', - pretrained=None, - depth_multiple=0.60, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - neck=dict( - type='CSPRepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - 
lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) diff --git a/configs/experiment/yolov6t_finetune.py b/configs/experiment/yolov6t_finetune.py deleted file mode 100644 index 8be47416..00000000 --- a/configs/experiment/yolov6t_finetune.py +++ /dev/null @@ -1,55 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained='weights/yolov6t.pt', - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/mbla/README.md b/configs/mbla/README.md deleted file mode 100644 index d163124d..00000000 --- a/configs/mbla/README.md +++ /dev/null @@ -1,28 +0,0 @@ -## YOLOv6 mbla model - -English | [简体中文](./README_cn.md) - -### Features - -- Apply MBLABlock(Multi Branch Layer Aggregation Block) blocks in the network structure. - -Advantage: -- Adopt a unified network structure and configuration. - -- Better performance for Small model comparing to yolov6 3.0 release. - -- Better performance comparing to yolov6 3.0 base. - - - -### Performance - -| Model | Size | mAPval
0.5:0.95 | SpeedT4
trt fp16 b1
(fps) | SpeedT4
trt fp16 b32
(fps) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | -------- | :----------------------- | -------------------------------------- | --------------------------------------- | -------------------- | ------------------- | -| [**YOLOv6-S-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6s_mbla.pt) | 640 | 47.0distill | 300 | 424 | 11.6 | 29.8 | -| [**YOLOv6-M-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6m_mbla.pt) | 640 | 50.3distill | 168 | 216 | 26.1 | 66.7 | -| [**YOLOv6-L-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6l_base.pt) | 640 | 52.0distill | 129 | 154 | 46.3 | 118.2 | -| [**YOLOv6-X-base**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6x_base.pt) | 640 | 53.5distill | 78 | 94 | 78.8 | 199.0 | - -- Speed is tested with TensorRT 8.4.2.4 on T4. -- The processes of model training, evaluation, and inference are the same as the original ones. For details, please refer to [this README](https://github.com/meituan/YOLOv6#quick-start). diff --git a/configs/mbla/README_cn.md b/configs/mbla/README_cn.md deleted file mode 100644 index ad399fe0..00000000 --- a/configs/mbla/README_cn.md +++ /dev/null @@ -1,26 +0,0 @@ -## YOLOv6 MBLA版模型 - -简体中文 | [English](./README.md) - -### 模型特点 - -- 网络主体结构均采用MBLABlock(Multi Branch Layer Aggregation Block) - -优势: -- 采用统一的网络结构和配置 - -- 相比3.0版本在s尺度效果提升,相比3.0base版本各尺度效果提升 - - - -### 模型指标 - -| 模型 | 输入尺寸 | mAPval
0.5:0.95 | 速度T4
trt fp16 b1
(fps) | 速度T4
trt fp16 b32
(fps) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | -------- | :----------------------- | -------------------------------------- | --------------------------------------- | -------------------- | ------------------- | -| [**YOLOv6-S-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6s_mbla.pt) | 640 | 47.0distill | 300 | 424 | 11.6 | 29.8 | -| [**YOLOv6-M-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6m_mbla.pt) | 640 | 50.3distill | 168 | 216 | 26.1 | 66.7 | -| [**YOLOv6-L-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6l_base.pt) | 640 | 52.0distill | 129 | 154 | 46.3 | 118.2 | -| [**YOLOv6-X-base**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6x_base.pt) | 640 | 53.5distill | 78 | 94 | 78.8 | 199.0 | - -- 速度是在 T4 上测试的,TensorRT 版本为 8.4.2.4; -- 模型训练、评估、推理流程与原来保持一致,具体可参考 [首页 README 文档](https://github.com/meituan/YOLOv6/blob/main/README_cn.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B)。 diff --git a/configs/mbla/yolov6l_mbla_finetune.py b/configs/mbla/yolov6l_mbla_finetune.py deleted file mode 100644 index 6ea88967..00000000 --- a/configs/mbla/yolov6l_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6l_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6m_mbla.py b/configs/mbla/yolov6m_mbla.py deleted file mode 100644 index f84fc43d..00000000 --- a/configs/mbla/yolov6m_mbla.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6m_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - 
reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6m_mbla_finetune.py b/configs/mbla/yolov6m_mbla_finetune.py deleted file mode 100644 index aa0bc816..00000000 --- a/configs/mbla/yolov6m_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6m_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6s_mbla.py b/configs/mbla/yolov6s_mbla.py deleted file mode 100644 index eedc76ee..00000000 --- a/configs/mbla/yolov6s_mbla.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6s_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.5, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) - 
-training_mode = "conv_silu" diff --git a/configs/mbla/yolov6s_mbla_finetune.py b/configs/mbla/yolov6s_mbla_finetune.py deleted file mode 100644 index a9812c71..00000000 --- a/configs/mbla/yolov6s_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6s_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.5, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6x_mbla.py b/configs/mbla/yolov6x_mbla.py deleted file mode 100644 index b7b9703c..00000000 --- a/configs/mbla/yolov6x_mbla.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6x_mbla', - pretrained=None, - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6x_mbla_finetune.py b/configs/mbla/yolov6x_mbla_finetune.py deleted file mode 100644 index 65c57cb2..00000000 --- a/configs/mbla/yolov6x_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6x_mbla', - pretrained=None, - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - 
csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/qarepvgg/README.md b/configs/qarepvgg/README.md deleted file mode 100644 index 81b130d2..00000000 --- a/configs/qarepvgg/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## YOLOv6 base model - -English | [简体中文](./README_cn.md) - -### Features - -- This is a RepOpt-version implementation of YOLOv6 according to [QARepVGG](https://arxiv.org/abs/2212.01593). - -- The QARep version models possess slightly lower float accuracy on COCO than the RepVGG version models, but achieve highly improved quantized accuracy. - -- The INT8 accuracies listed were obtained using a simple PTQ process, as implemented in the [`onnx_to_trt.py`](../../deploy/TensorRT/onnx_to_trt.py) script. However, higher accuracies could be achieved using Quantization-Aware Training (QAT) due to the specific architecture design of the QARepVGG model. - -### Performance - -| Model | Size | Float
mAPval
0.5:0.95 | INT8
mAPval
0.5:0.95 | SpeedT4
trt fp16 b32
(fps) | SpeedT4
trt int8 b32
(fps) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | -------- | :----------------------- | -------------------------------------- | --------------------------------------- | -------------------- | ------------------- | -------------------- | -| [**YOLOv6-N**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n.pt) | 640 | 37.5 | 34.3 | 1286 | 1773 |4.7 | 11.4 | -| [**YOLOv6-N-qa**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n_qa.pt) | 640 | 37.1 | 36.4 | 1286 | 1773 | 4.7 | 11.4 | -| [**YOLOv6-S**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s.pt) | 640 | 45.0 | 41.3 | 513 | 1117 | 18.5 | 45.3 | -| [**YOLOv6-S-qa**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s_qa.pt) | 640 | 44.7 | 44.0 | 513 | 1117 | 18.5 | 45.3 | -| [**YOLOv6-M**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m.pt) | 640 | 50.0 | 48.1 | 250 | 439 | 34.9 | 85.8 | -| [**YOLOv6-M-qa**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m_qa.pt) | 640 | 49.7 | 49.4 | 250 | 439 | 34.9 | 85.8 | - -- Speed is tested with TensorRT 8.4 on T4. -- We have not conducted experiments on the YOLOv6-L model since it does not use the RepVGG architecture. -- The processes of model training, evaluation, and inference are the same as the original ones. For details, please refer to [this README](https://github.com/meituan/YOLOv6#quick-start). diff --git a/configs/repopt/yolov6_tiny_hs.py b/configs/repopt/yolov6_tiny_hs.py deleted file mode 100644 index 70a74279..00000000 --- a/configs/repopt/yolov6_tiny_hs.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='hyper_search' diff --git a/configs/repopt/yolov6_tiny_opt.py b/configs/repopt/yolov6_tiny_opt.py deleted file mode 100644 index 95dbf317..00000000 --- a/configs/repopt/yolov6_tiny_opt.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained=None, - scales='../yolov6_assert/v6t_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - 
atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6_tiny_opt_qat.py b/configs/repopt/yolov6_tiny_opt_qat.py deleted file mode 100644 index 701bf4f1..00000000 --- a/configs/repopt/yolov6_tiny_opt_qat.py +++ /dev/null @@ -1,83 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained='./assets/v6s_t.pt', - scales='./assets/v6t_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.00001, - lrf=0.001, - momentum=0.937, - weight_decay=0.00005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -ptq = dict( - num_bits = 8, - calib_batches = 4, - # 'max', 'histogram' - calib_method = 'max', - # 'entropy', 'percentile', 'mse' - histogram_amax_method='entropy', - histogram_amax_percentile=99.99, - calib_output_path='./', - sensitive_layers_skip=False, - sensitive_layers_list=[], -) - -qat = dict( - calib_pt = './assets/v6s_t_calib_max.pt', - sensitive_layers_skip = False, - sensitive_layers_list=[], -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6n_hs.py b/configs/repopt/yolov6n_hs.py deleted file mode 100644 index 67607ba2..00000000 --- a/configs/repopt/yolov6n_hs.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, #0.01 # 0.02 - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - 
hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='hyper_search' diff --git a/configs/repopt/yolov6n_opt.py b/configs/repopt/yolov6n_opt.py deleted file mode 100644 index 9b3db4fb..00000000 --- a/configs/repopt/yolov6n_opt.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n', - pretrained=None, - scales='../yolov6_assert/v6n_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, #0.01 # 0.02 - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6n_opt_qat.py b/configs/repopt/yolov6n_opt_qat.py deleted file mode 100644 index 4e76dfd3..00000000 --- a/configs/repopt/yolov6n_opt_qat.py +++ /dev/null @@ -1,82 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n', - pretrained='./assets/v6s_n.pt', - scales='./assets/v6n_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.00001, #0.01 # 0.02 - lrf=0.001, - momentum=0.937, - weight_decay=0.00005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -ptq = dict( - num_bits = 8, - calib_batches = 4, - # 'max', 'histogram' - calib_method = 'max', - # 'entropy', 'percentile', 'mse' - histogram_amax_method='entropy', - histogram_amax_percentile=99.99, - calib_output_path='./', - sensitive_layers_skip=False, - sensitive_layers_list=[], -) - -qat = dict( - calib_pt = './assets/v6s_n_calib_max.pt', - sensitive_layers_skip = False, - sensitive_layers_list=[], -) -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6s_hs.py 
b/configs/repopt/yolov6s_hs.py deleted file mode 100644 index 60c7286a..00000000 --- a/configs/repopt/yolov6s_hs.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, - reg_max=0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='hyper_search' diff --git a/configs/repopt/yolov6s_opt.py b/configs/repopt/yolov6s_opt.py deleted file mode 100644 index 2676eb4f..00000000 --- a/configs/repopt/yolov6s_opt.py +++ /dev/null @@ -1,60 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained=None, - scales='../yolov6_assert/v6s_v2_scale.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, - reg_max=0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6s_opt_qat.py b/configs/repopt/yolov6s_opt_qat.py deleted file mode 100644 index a41ea085..00000000 --- a/configs/repopt/yolov6s_opt_qat.py +++ /dev/null @@ -1,113 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained='./assets/yolov6s_v2_reopt_43.1.pt', - scales='./assets/yolov6s_v2_scale.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type = 'giou', - use_dfl = False, - reg_max = 0, # if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - 
optim='SGD', - lr_scheduler='Cosine', - lr0=0.00001, - lrf=0.001, - momentum=0.937, - weight_decay=0.00005, - warmup_epochs=3, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -ptq = dict( - num_bits = 8, - calib_batches = 4, - # 'max', 'histogram' - calib_method = 'histogram', - # 'entropy', 'percentile', 'mse' - histogram_amax_method='entropy', - histogram_amax_percentile=99.99, - calib_output_path='./', - sensitive_layers_skip=False, - sensitive_layers_list=['detect.stems.0.conv', - 'detect.stems.1.conv', - 'detect.stems.2.conv', - 'detect.cls_convs.0.conv', - 'detect.cls_convs.1.conv', - 'detect.cls_convs.2.conv', - 'detect.reg_convs.0.conv', - 'detect.reg_convs.1.conv', - 'detect.reg_convs.2.conv', - 'detect.cls_preds.0', - 'detect.cls_preds.1', - 'detect.cls_preds.2', - 'detect.reg_preds.0', - 'detect.reg_preds.1', - 'detect.reg_preds.2', - ], -) - -qat = dict( - calib_pt = './assets/yolov6s_v2_reopt_43.1_calib_histogram.pt', - sensitive_layers_skip = False, - sensitive_layers_list=['detect.stems.0.conv', - 'detect.stems.1.conv', - 'detect.stems.2.conv', - 'detect.cls_convs.0.conv', - 'detect.cls_convs.1.conv', - 'detect.cls_convs.2.conv', - 'detect.reg_convs.0.conv', - 'detect.reg_convs.1.conv', - 'detect.reg_convs.2.conv', - 'detect.cls_preds.0', - 'detect.cls_preds.1', - 'detect.cls_preds.2', - 'detect.reg_preds.0', - 'detect.reg_preds.1', - 'detect.reg_preds.2', - ], -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/yolov6l.py b/configs/solo/yolov6l_solo.py similarity index 92% rename from configs/yolov6l.py rename to configs/solo/yolov6l_solo.py index bfa6728b..caabc1f4 100644 --- a/configs/yolov6l.py +++ b/configs/solo/yolov6l_solo.py @@ -1,4 +1,4 @@ -# YOLOv6l model +# YOLOv6l-seg model model = dict( type='YOLOv6l', pretrained=None, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/qarepvgg/yolov6m_qa.py b/configs/solo/yolov6m_solo.py similarity index 92% rename from configs/qarepvgg/yolov6m_qa.py rename to configs/solo/yolov6m_solo.py index c0690f15..84e73c0f 100644 --- a/configs/qarepvgg/yolov6m_qa.py +++ b/configs/solo/yolov6m_solo.py @@ -1,4 +1,4 @@ -# YOLOv6m model +# YOLOv6m-seg model model = dict( type='YOLOv6m', pretrained=None, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -64,5 +68,3 @@ mosaic=1.0, mixup=0.1, ) - -training_mode='qarepvggv2' diff --git a/configs/yolov6n.py b/configs/solo/yolov6n_solo.py similarity index 92% rename from configs/yolov6n.py rename to configs/solo/yolov6n_solo.py index 74f9386d..6392ceb4 100644 --- a/configs/yolov6n.py +++ b/configs/solo/yolov6n_solo.py @@ -1,4 +1,4 @@ -# YOLOv6n model +# YOLOv6n-seg model model = dict( type='YOLOv6n', 
pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -44,7 +48,7 @@ lr0=0.02, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/qarepvgg/yolov6s_qa.py b/configs/solo/yolov6s_solo.py similarity index 94% rename from configs/qarepvgg/yolov6s_qa.py rename to configs/solo/yolov6s_solo.py index 3051679a..c2499ba3 100644 --- a/configs/qarepvgg/yolov6s_qa.py +++ b/configs/solo/yolov6s_solo.py @@ -1,4 +1,4 @@ -# YOLOv6s model +# YOLOv6s-seg model model = dict( type='YOLOv6s', pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -63,5 +67,3 @@ mosaic=1.0, mixup=0.0, ) - -training_mode='qarepvggv2' diff --git a/configs/base/yolov6m_base.py b/configs/solo/yolov6x_solo.py similarity index 81% rename from configs/base/yolov6m_base.py rename to configs/solo/yolov6x_solo.py index 5670f096..57a175ab 100644 --- a/configs/base/yolov6m_base.py +++ b/configs/solo/yolov6x_solo.py @@ -1,9 +1,9 @@ -# YOLOv6m medium/large base model +# YOLOv6x-seg model model = dict( - type='YOLOv6m_base', + type='YOLOv6x', pretrained=None, - depth_multiple=0.80, - width_multiple=0.75, + depth_multiple=1.33, + width_multiple=1.25, backbone=dict( type='CSPBepBackbone', num_repeats=[1, 6, 12, 18, 6], @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -33,7 +37,7 @@ use_dfl=True, reg_max=16, #if use_dfl is False, please set reg_max to 0 distill_weight={ - 'class': 0.8, + 'class': 2.0, 'dfl': 1.0, }, ) @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.0015, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -64,4 +68,5 @@ mosaic=1.0, mixup=0.1, ) -training_mode = "conv_relu" +training_mode = "conv_silu" +# use normal conv to speed up training and further improve accuracy. diff --git a/configs/yolov6_lite/README.md b/configs/yolov6_lite/README.md deleted file mode 100644 index 170d12d9..00000000 --- a/configs/yolov6_lite/README.md +++ /dev/null @@ -1,22 +0,0 @@ -## YOLOv6Lite model - -English | [简体中文](./README_cn.md) - -## Mobile Benchmark -| Model | Size | mAPval
0.5:0.95 | sm8350
(ms) | mt6853
(ms) | sdm660
(ms) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | ---- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -| [**YOLOv6Lite-S**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_s.pt) | 320*320 | 22.4 | 7.99 | 11.99 | 41.86 | 0.55 | 0.56 | -| [**YOLOv6Lite-M**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_m.pt) | 320*320 | 25.1 | 9.08 | 13.27 | 47.95 | 0.79 | 0.67 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*320 | 28.0 | 11.37 | 16.20 | 61.40 | 1.09 | 0.87 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*192 | 25.0 | 7.02 | 9.66 | 36.13 | 1.09 | 0.52 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 224*128 | 18.9 | 3.63 | 4.99 | 17.76 | 1.09 | 0.24 | - -
-Table Notes - -- From the perspective of model size and input image ratio, we have built a series of models on the mobile terminal to facilitate flexible applications in different scenarios. -- All checkpoints are trained with 400 epochs without distillation. -- Results of the mAP and speed are evaluated on [COCO val2017](https://cocodataset.org/#download) dataset, and the input resolution is the Size in the table. -- Speed is tested on MNN 2.3.0 AArch64 with 2 threads by arm82 acceleration. The inference warm-up is performed 10 times, and the cycle is performed 100 times. -- Qualcomm 888(sm8350), Dimensity 720(mt6853) and Qualcomm 660(sdm660) correspond to chips with different performances at the high, middle and low end respectively, which can be used as a reference for model capabilities under different chips. -- Refer to [Test NCNN Speed](./docs/Test_NCNN_speed.md) tutorial to reproduce the NCNN speed results of YOLOv6Lite. diff --git a/configs/yolov6_lite/README_cn.md b/configs/yolov6_lite/README_cn.md deleted file mode 100644 index 23dd715e..00000000 --- a/configs/yolov6_lite/README_cn.md +++ /dev/null @@ -1,23 +0,0 @@ -## YOLOv6 轻量级模型 - -简体中文 | [English](./README.md) - -## 移动端模型指标 - -| 模型 | 输入尺寸 | mAPval
0.5:0.95 | sm8350
(ms) | mt6853
(ms) | sdm660
(ms) |Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | ---- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -| [**YOLOv6Lite-S**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_s.pt) | 320*320 | 22.4 | 7.99 | 11.99 | 41.86 | 0.55 | 0.56 | -| [**YOLOv6Lite-M**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_m.pt) | 320*320 | 25.1 | 9.08 | 13.27 | 47.95 | 0.79 | 0.67 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*320 | 28.0 | 11.37 | 16.20 | 61.40 | 1.09 | 0.87 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*192 | 25.0 | 7.02 | 9.66 | 36.13 | 1.09 | 0.52 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 224*128 | 18.9 | 3.63 | 4.99 | 17.76 | 1.09 | 0.24 | - -
-表格笔记 - -- 从模型尺寸和输入图片比例两种角度,在构建了移动端系列模型,方便不同场景下的灵活应用。 -- 所有权重都经过 400 个 epoch 的训练,并且没有使用蒸馏技术。 -- mAP 和速度指标是在 COCO val2017 数据集上评估的,输入分辨率为表格中对应展示的。 -- 使用 MNN 2.3.0 AArch64 进行速度测试。测速时,采用2个线程,并开启arm82加速,推理预热10次,循环100次。 -- 高通888(sm8350)、天玑720(mt6853)和高通660(sdm660)分别对应高中低端不同性能的芯片,可以作为不同芯片下机型能力的参考。 -- [NCNN 速度测试](./docs/Test_NCNN_speed.md)教程可以帮助展示及复现 YOLOv6Lite 的 NCNN 速度结果。 diff --git a/configs/yolov6_lite/yolov6_lite_l.py b/configs/yolov6_lite/yolov6_lite_l.py deleted file mode 100644 index 212c8c73..00000000 --- a/configs/yolov6_lite/yolov6_lite_l.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-l model -model = dict( - type='YOLOv6-lite-l', - pretrained=None, - width_multiple=1.5, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.1 * 4, - lrf=0.01, - momentum=0.9, - weight_decay=0.00004, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6_lite/yolov6_lite_l_finetune.py b/configs/yolov6_lite/yolov6_lite_l_finetune.py deleted file mode 100644 index 48315c4d..00000000 --- a/configs/yolov6_lite/yolov6_lite_l_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-l model -model = dict( - type='YOLOv6-lite-l', - pretrained='weights/yolov6lite_l.pt', - width_multiple=1.5, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6_lite/yolov6_lite_m.py b/configs/yolov6_lite/yolov6_lite_m.py deleted file mode 100644 index 8f0de368..00000000 --- a/configs/yolov6_lite/yolov6_lite_m.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-m model -model = dict( - type='YOLOv6-lite-m', - pretrained=None, - width_multiple=1.1, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max 
to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.1 * 4, - lrf=0.01, - momentum=0.9, - weight_decay=0.00004, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6_lite/yolov6_lite_m_finetune.py b/configs/yolov6_lite/yolov6_lite_m_finetune.py deleted file mode 100644 index 108adda5..00000000 --- a/configs/yolov6_lite/yolov6_lite_m_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-m model -model = dict( - type='YOLOv6-lite-m', - pretrained='weights/yolov6lite_m.pt', - width_multiple=1.1, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6_lite/yolov6_lite_s.py b/configs/yolov6_lite/yolov6_lite_s.py deleted file mode 100644 index 42a52e37..00000000 --- a/configs/yolov6_lite/yolov6_lite_s.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-s model -model = dict( - type='YOLOv6-lite-s', - pretrained=None, - width_multiple=0.7, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.1 * 4, - lrf=0.01, - momentum=0.9, - weight_decay=0.00004, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6_lite/yolov6_lite_s_finetune.py b/configs/yolov6_lite/yolov6_lite_s_finetune.py deleted file mode 100644 index befee2ce..00000000 --- a/configs/yolov6_lite/yolov6_lite_s_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-s model -model = dict( - type='YOLOv6-lite-s', - pretrained='weights/yolov6lite_s.pt', - width_multiple=0.7, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - 
use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6l6.py b/configs/yolov6l6.py deleted file mode 100644 index 3bb77c5f..00000000 --- a/configs/yolov6l6.py +++ /dev/null @@ -1,62 +0,0 @@ -# YOLOv6l6 model -model = dict( - type='YOLOv6l6', - pretrained=None, - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.2, -) -training_mode = "conv_silu" diff --git a/configs/yolov6l6_finetune.py b/configs/yolov6l6_finetune.py deleted file mode 100644 index 2ffb8ada..00000000 --- a/configs/yolov6l6_finetune.py +++ /dev/null @@ -1,62 +0,0 @@ -# YOLOv6l6 model -model = dict( - type='YOLOv6l6', - pretrained='weights/yolov6l6.pt', - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_silu" diff --git a/configs/yolov6l_finetune.py b/configs/yolov6l_finetune.py deleted file mode 100644 index 9b301233..00000000 --- a/configs/yolov6l_finetune.py +++ /dev/null @@ -1,68 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6l', - pretrained='weights/yolov6l.pt', - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - 
out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_silu" -# use normal conv to speed up training and further improve accuracy. diff --git a/configs/base/yolov6l_base.py b/configs/yolov6l_seg.py similarity index 85% rename from configs/base/yolov6l_base.py rename to configs/yolov6l_seg.py index ef2dbbb2..2ed9211f 100644 --- a/configs/base/yolov6l_base.py +++ b/configs/yolov6l_seg.py @@ -1,6 +1,6 @@ -# YOLOv6l large base model +# YOLOv6l-seg model model = dict( - type='YOLOv6l_base', + type='YOLOv6l', pretrained=None, depth_multiple=1.0, width_multiple=1.0, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -64,4 +68,5 @@ mosaic=1.0, mixup=0.1, ) -training_mode = "conv_relu" +training_mode = "conv_silu" +# use normal conv to speed up training and further improve accuracy. 
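Note: the segmentation configs in this patch (the renamed `*_seg.py` files and the `configs/solo/*.py` variants) differ from the detection configs mainly in four new head fields, `npr`, `nm`, `isseg` and `issolo`, plus a larger `weight_decay`. The sketch below is only a minimal illustration of how those fields combine in the head dict; the interpretations of `npr` (prototype feature channels) and `nm` (mask coefficients per prediction) are assumptions based on common YOLO-style segmentation heads, not definitions taken from this patch.

```python
# Minimal sketch of the segmentation-specific head fields introduced by this patch.
# The comments on npr/nm describe assumed meanings (prototype channels / number of
# mask coefficients), inferred from typical YOLO-style segmentation heads.

def make_seg_head(nm=32, issolo=False):
    """Build a head dict following the pattern used by the *_seg.py configs."""
    return dict(
        type='EffiDeHead',
        in_channels=[128, 256, 512],
        num_layers=3,
        begin_indices=24,
        npr=256,        # prototype feature channels (assumed meaning)
        nm=nm,          # mask coefficients: 32 in the *_seg.py configs, 64 in the solo configs
        isseg=True,     # enable the segmentation branch
        issolo=issolo,  # True only for the configs/solo/*.py variants
        anchors=3,
        strides=[8, 16, 32],
        use_dfl=True,
        reg_max=16,
    )


if __name__ == '__main__':
    print(make_seg_head())                     # e.g. configs/yolov6s_seg.py style
    print(make_seg_head(nm=64, issolo=True))   # e.g. configs/solo/yolov6s_solo.py style
```

This only illustrates the shape of the config entry; how the head implementation consumes these fields is defined elsewhere in the YOLOv6 codebase.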
diff --git a/configs/yolov6m6.py b/configs/yolov6m6.py deleted file mode 100644 index e741bbc0..00000000 --- a/configs/yolov6m6.py +++ /dev/null @@ -1,61 +0,0 @@ -# YOLOv6m6 model -model = dict( - type='YOLOv6m6', - pretrained=None, - depth_multiple=0.60, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(2)/3, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(2)/3, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) diff --git a/configs/yolov6m6_finetune.py b/configs/yolov6m6_finetune.py deleted file mode 100644 index 83760d3a..00000000 --- a/configs/yolov6m6_finetune.py +++ /dev/null @@ -1,61 +0,0 @@ -# YOLOv6m6 model -model = dict( - type='YOLOv6m6', - pretrained='weights/yolov6m6.pt', - depth_multiple=0.60, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(2)/3, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(2)/3, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6m_finetune.py b/configs/yolov6m_finetune.py deleted file mode 100644 index cfe0fa93..00000000 --- a/configs/yolov6m_finetune.py +++ /dev/null @@ -1,66 +0,0 @@ -# YOLOv6m model -model = dict( - type='YOLOv6m', - pretrained='weights/yolov6m.pt', - depth_multiple=0.60, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(2)/3, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(2)/3, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - 
use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 0.8, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6m.py b/configs/yolov6m_seg.py similarity index 92% rename from configs/yolov6m.py rename to configs/yolov6m_seg.py index 29fae396..d8660be3 100644 --- a/configs/yolov6m.py +++ b/configs/yolov6m_seg.py @@ -1,4 +1,4 @@ -# YOLOv6m model +# YOLOv6m-seg model model = dict( type='YOLOv6m', pretrained=None, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/yolov6n6.py b/configs/yolov6n6.py deleted file mode 100644 index 0abe3a44..00000000 --- a/configs/yolov6n6.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n6', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to True. - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6n6_finetune.py b/configs/yolov6n6_finetune.py deleted file mode 100644 index 01100f0f..00000000 --- a/configs/yolov6n6_finetune.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n6', - pretrained='weights/yolov6n6.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to True. 
- cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6n_finetune.py b/configs/yolov6n_finetune.py deleted file mode 100644 index 03b6d1ba..00000000 --- a/configs/yolov6n_finetune.py +++ /dev/null @@ -1,65 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6n', - pretrained='weights/yolov6n.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/qarepvgg/yolov6n_qa.py b/configs/yolov6n_seg.py similarity index 92% rename from configs/qarepvgg/yolov6n_qa.py rename to configs/yolov6n_seg.py index b42d9ddb..94b42ed1 100644 --- a/configs/qarepvgg/yolov6n_qa.py +++ b/configs/yolov6n_seg.py @@ -1,4 +1,4 @@ -# YOLOv6s model +# YOLOv6n-seg model model = dict( type='YOLOv6n', pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -44,7 +48,7 @@ lr0=0.02, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -63,4 +67,3 @@ mosaic=1.0, mixup=0.0, ) -training_mode='qarepvggv2' diff --git a/configs/yolov6s6.py b/configs/yolov6s6.py deleted file mode 100644 index 091bfffc..00000000 --- a/configs/yolov6s6.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6s6', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to 
True. - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6s6_finetune.py b/configs/yolov6s6_finetune.py deleted file mode 100644 index a22697ed..00000000 --- a/configs/yolov6s6_finetune.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6s6', - pretrained='weights/yolov6s6.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to True. - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6s_finetune.py b/configs/yolov6s_finetune.py deleted file mode 100644 index d6fb27fe..00000000 --- a/configs/yolov6s_finetune.py +++ /dev/null @@ -1,65 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained='weights/yolov6s.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - 
mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6s.py b/configs/yolov6s_seg.py similarity index 92% rename from configs/yolov6s.py rename to configs/yolov6s_seg.py index 8d8b6739..c4274ccc 100644 --- a/configs/yolov6s.py +++ b/configs/yolov6s_seg.py @@ -1,4 +1,4 @@ -# YOLOv6s model +# YOLOv6s-seg model model = dict( type='YOLOv6s', pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -44,7 +48,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/mbla/yolov6l_mbla.py b/configs/yolov6x_seg.py similarity index 79% rename from configs/mbla/yolov6l_mbla.py rename to configs/yolov6x_seg.py index 7534b705..3ef53e50 100644 --- a/configs/mbla/yolov6l_mbla.py +++ b/configs/yolov6x_seg.py @@ -1,29 +1,31 @@ -# YOLOv6l model +# YOLOv6l-seg model model = dict( - type='YOLOv6l_mbla', + type='YOLOv6l', pretrained=None, - depth_multiple=0.5, - width_multiple=1.0, + depth_multiple=1.33, + width_multiple=1.25, backbone=dict( type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], + num_repeats=[1, 6, 12, 18, 6], out_channels=[64, 128, 256, 512, 1024], csp_e=float(1)/2, fuse_P2=True, - stage_block_type="MBLABlock", ), neck=dict( type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], + num_repeats=[12, 12, 12, 12], out_channels=[256, 128, 128, 256, 256, 512], csp_e=float(1)/2, - stage_block_type="MBLABlock", ), head=dict( type='EffiDeHead', in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -47,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -66,5 +68,5 @@ mosaic=1.0, mixup=0.1, ) - training_mode = "conv_silu" +# use normal conv to speed up training and further improve accuracy. diff --git a/data/coco.yaml b/data/coco.yaml index d20d411e..8ce2676d 100644 --- a/data/coco.yaml +++ b/data/coco.yaml @@ -1,13 +1,11 @@ # COCO 2017 dataset http://cocodataset.org -train: ../coco/images/train2017 # 118287 images -val: ../coco/images/val2017 # 5000 images -test: ../coco/images/test2017 -anno_path: ../coco/annotations/instances_val2017.json +train: ./data/coco/images/train2017 # 118287 images +val: ./data/coco/images/val2017 # 5000 images +# test: ./data/coco/images/val2017 # number of classes nc: 80 -# whether it is coco dataset, only coco dataset should be set to True. 
-is_coco: True + # class names names: [ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', diff --git a/data/images/000000056350.jpg b/data/images/000000056350.jpg new file mode 100644 index 00000000..0c95d084 Binary files /dev/null and b/data/images/000000056350.jpg differ diff --git a/data/images/9_Press_Conference_Press_Conference_9_946.jpg b/data/images/9_Press_Conference_Press_Conference_9_946.jpg new file mode 100644 index 00000000..aa342667 Binary files /dev/null and b/data/images/9_Press_Conference_Press_Conference_9_946.jpg differ diff --git a/deploy/ONNX/README.md b/deploy/ONNX/README.md index d42f3c8c..c3a618cb 100644 --- a/deploy/ONNX/README.md +++ b/deploy/ONNX/README.md @@ -33,15 +33,6 @@ python ./deploy/ONNX/export_onnx.py \ - `--conf-thres` : Confidence threshold for NMS algorithm. - `--device` : Export device. Cuda device : 0 or 0,1,2,3 ... , CPU : cpu . -## Download - -* [YOLOv6-N](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6n.onnx) -* [YOLOv6-T](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6t.onnx) -* [YOLOv6-S](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6s.onnx) -* [YOLOv6-M](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6m.onnx) -* [YOLOv6-L-ReLU](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6l_relu.onnx) -* [YOLOv6-L](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6l.onnx) - ## End2End export diff --git a/deploy/ONNX/export_onnx.py b/deploy/ONNX/export_onnx.py index ba7440ae..85368c85 100644 --- a/deploy/ONNX/export_onnx.py +++ b/deploy/ONNX/export_onnx.py @@ -22,7 +22,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--weights', type=str, default='./yolov6s.pt', help='weights path') + parser.add_argument('--weights', type=str, default='./weights/best_ckpt.pt', help='weights path') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size, the order is: height width') # height, width parser.add_argument('--batch-size', type=int, default=1, help='batch size') parser.add_argument('--half', action='store_true', help='FP16 half-precision export') diff --git a/tools/eval.py b/tools/eval.py index 5543029c..7814639e 100644 --- a/tools/eval.py +++ b/tools/eval.py @@ -23,13 +23,13 @@ def boolean_string(s): def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description='YOLOv6 PyTorch Evalating', add_help=add_help) parser.add_argument('--data', type=str, default='./data/coco.yaml', help='dataset.yaml path') - parser.add_argument('--weights', type=str, default='./weights/yolov6s.pt', help='model.pt path(s)') - parser.add_argument('--batch-size', type=int, default=32, help='batch size') + parser.add_argument('--weights', type=str, default='./checkpoints/yolov6n_yol.pt', help='model.pt path(s)') + parser.add_argument('--batch-size', type=int, default=2, help='batch size') parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)') parser.add_argument('--conf-thres', type=float, default=0.03, help='confidence threshold') parser.add_argument('--iou-thres', type=float, default=0.65, help='NMS IoU threshold') parser.add_argument('--task', default='val', help='val, test, or speed') - parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--device', default='4', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') parser.add_argument('--half', default=False, action='store_true', help='whether to use fp16 infer') parser.add_argument('--save_dir', type=str, default='runs/val/', help='evaluation save dir') parser.add_argument('--name', type=str, default='exp', help='save evaluation results to save_dir/name') @@ -37,8 +37,8 @@ def get_args_parser(add_help=True): parser.add_argument('--infer_on_rect', default=True, type=boolean_string, help='default to run with rectangle image to boost speed.') parser.add_argument('--reproduce_640_eval', default=False, action='store_true', help='whether to reproduce 640 infer result, overwrite some config') parser.add_argument('--eval_config_file', type=str, default='./configs/experiment/eval_640_repro.py', help='config file for repro 640 infer result') - parser.add_argument('--do_coco_metric', default=True, type=boolean_string, help='whether to use pycocotool to metric, set False to close') - parser.add_argument('--do_pr_metric', default=False, type=boolean_string, help='whether to calculate precision, recall and F1, n, set False to close') + parser.add_argument('--do_coco_metric', default=False, type=boolean_string, help='whether to use pycocotool to metric, set False to close') + parser.add_argument('--do_pr_metric', default=True, type=boolean_string, help='whether to calculate precision, recall and F1, n, set False to close') parser.add_argument('--plot_curve', default=True, type=boolean_string, help='whether to save plots in savedir when do pr metric, set False to close') parser.add_argument('--plot_confusion_matrix', default=False, action='store_true', help='whether to save confusion matrix plots when do pr metric, might cause no harm warning print') parser.add_argument('--verbose', default=False, action='store_true', help='whether to print metric on each class') @@ -46,6 +46,7 @@ def get_args_parser(add_help=True): parser.add_argument('--specific-shape', action='store_true', help='rectangular training') parser.add_argument('--height', type=int, default=None, help='image height of model input') parser.add_argument('--width', type=int, default=None, help='image width of model input') + parser.add_argument('--issolo', default=False, type=boolean_string, help='is solo format') args = parser.parse_args() if args.config_file: @@ -113,7 +114,8 @@ def run(data, config_file=None, specific_shape=False, height=640, - width=640 + width=640, + issolo=False ): """ Run the evaluation process @@ -155,9 +157,11 @@ def run(data, # eval model.eval() - pred_result, vis_outputs, vis_paths = val.predict_model(model, dataloader, task) - eval_result = val.eval_model(pred_result, model, dataloader, task) - return eval_result, vis_outputs, vis_paths + pred_result, _, __= val.predict_model(model, dataloader, task, issolo=issolo) + return pred_result, _, __ + #raise ValueError("..") + #eval_result = val.eval_model(pred_result, model, dataloader, task) + #return eval_result, vis_outputs, vis_paths def main(args): diff --git a/tools/infer.py b/tools/infer.py index 95b3fdc7..cb051112 100644 --- a/tools/infer.py +++ b/tools/infer.py @@ -17,11 +17,11 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description='YOLOv6 PyTorch Inference.', add_help=add_help) - parser.add_argument('--weights', type=str, default='weights/yolov6s.pt', help='model path(s) for inference.') - parser.add_argument('--source', type=str, default='data/images', help='the source path, e.g. 
image-file/dir.') + parser.add_argument('--weights', type=str, default='./checkpoints/yolov6n_yol.pt', help='model path(s) for inference.') + parser.add_argument('--source', type=str, default='./data/images', help='the source path, e.g. image-file/dir.') parser.add_argument('--webcam', action='store_true', help='whether to use webcam.') - parser.add_argument('--webcam-addr', type=str, default='0', help='the web camera address, local camera or rtsp address.') - parser.add_argument('--yaml', type=str, default='data/coco.yaml', help='data yaml file.') + parser.add_argument('--webcam-addr', type=str, default='6', help='the web camera address, local camera or rtsp address.') + parser.add_argument('--yaml', type=str, default='data/test.yaml', help='data yaml file.') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='the image-size(h,w) in inference size.') parser.add_argument('--conf-thres', type=float, default=0.4, help='confidence threshold for inference.') parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold for inference.') @@ -29,7 +29,7 @@ def get_args_parser(add_help=True): parser.add_argument('--device', default='0', help='device to run our model i.e. 0 or 0,1,2,3 or cpu.') parser.add_argument('--save-txt', action='store_true', help='save results to *.txt.') parser.add_argument('--not-save-img', action='store_true', help='do not save visuallized inference results.') - parser.add_argument('--save-dir', type=str, help='directory to save predictions in. See --save-txt.') + parser.add_argument('--save-dir', type=str, default='./runs/inference', help='directory to save predictions in. See --save-txt.') parser.add_argument('--view-img', action='store_true', help='show inference results') parser.add_argument('--classes', nargs='+', type=int, help='filter by classes, e.g. --classes 0, or --classes 0 2 3.') parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS.') @@ -38,6 +38,7 @@ def get_args_parser(add_help=True): parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels.') parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences.') parser.add_argument('--half', action='store_true', help='whether to use FP16 half-precision inference.') + parser.add_argument('--issolo', action='store_true', help='solo structure or not') args = parser.parse_args() LOGGER.info(args) @@ -66,6 +67,7 @@ def run(weights=osp.join(ROOT, 'yolov6s.pt'), hide_labels=False, hide_conf=False, half=False, + issolo=False ): """ Inference process, supporting inference on one image file or directory which containing images. 
Args: @@ -105,7 +107,7 @@ def run(weights=osp.join(ROOT, 'yolov6s.pt'), # Inference inferer = Inferer(source, webcam, webcam_addr, weights, device, yaml, img_size, half) - inferer.infer(conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, not not_save_img, hide_labels, hide_conf, view_img) + inferer.infer(conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, not not_save_img, hide_labels, hide_conf, view_img, issolo=issolo) if save_txt or not not_save_img: LOGGER.info(f"Results saved to {save_dir}") diff --git a/tools/train.py b/tools/train.py index 635c68e4..9771e562 100644 --- a/tools/train.py +++ b/tools/train.py @@ -25,10 +25,10 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description='YOLOv6 PyTorch Training', add_help=add_help) parser.add_argument('--data-path', default='./data/coco.yaml', type=str, help='path of dataset') - parser.add_argument('--conf-file', default='./configs/yolov6n.py', type=str, help='experiments description file') + parser.add_argument('--conf-file', default='./configs/yolov6s.py', type=str, help='experiments description file') parser.add_argument('--img-size', default=640, type=int, help='train, val image size (pixels)') parser.add_argument('--rect', action='store_true', help='whether to use rectangular training, default is False') - parser.add_argument('--batch-size', default=32, type=int, help='total batch size for all GPUs') + parser.add_argument('--batch-size', default=16, type=int, help='total batch size for all GPUs') parser.add_argument('--epochs', default=400, type=int, help='number of total epochs to run') parser.add_argument('--workers', default=8, type=int, help='number of data loading workers (default: 8)') parser.add_argument('--device', default='0', type=str, help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') @@ -45,7 +45,7 @@ def get_args_parser(add_help=True): parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume the most recent training') parser.add_argument('--write_trainbatch_tb', action='store_true', help='write train_batch image to tensorboard once an epoch, may slightly slower train speed if open') - parser.add_argument('--stop_aug_last_n_epoch', default=15, type=int, help='stop strong aug at last n epoch, neg value not stop, default 15') + parser.add_argument('--stop_aug_last_n_epoch', default=-1, type=int, help='stop strong aug at last n epoch, neg value not stop, default 15') parser.add_argument('--save_ckpt_on_last_n_epoch', default=-1, type=int, help='save last n epoch even not best or last, neg value not save') parser.add_argument('--distill', action='store_true', help='distill or not') parser.add_argument('--distill_feat', action='store_true', help='distill featmap or not') @@ -54,7 +54,7 @@ def get_args_parser(add_help=True): parser.add_argument('--teacher_model_path', type=str, default=None, help='teacher model path') parser.add_argument('--temperature', type=int, default=20, help='distill temperature') parser.add_argument('--fuse_ab', action='store_true', help='fuse ab branch in training process or not') - parser.add_argument('--bs_per_gpu', default=32, type=int, help='batch size per GPU for auto-rescale learning rate, set to 16 for P6 models') + parser.add_argument('--bs_per_gpu', default=8, type=int, help='batch size per GPU for auto-rescale learning rate, set to 16 for P6 models') parser.add_argument('--specific-shape', action='store_true', help='rectangular training') parser.add_argument('--height', type=int, default=None, help='image height of model input') parser.add_argument('--width', type=int, default=None, help='image width of model input') diff --git a/yolov6/assigners/anchor_generator.py b/yolov6/assigners/anchor_generator.py index c8276418..3a41e0ba 100644 --- a/yolov6/assigners/anchor_generator.py +++ b/yolov6/assigners/anchor_generator.py @@ -1,7 +1,5 @@ import torch -from yolov6.utils.general import check_version -torch_1_10_plus = check_version(torch.__version__, minimum='1.10.0') def generate_anchors(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0.5, device='cpu', is_eval=False, mode='af'): '''Generate anchors from features.''' @@ -15,7 +13,10 @@ def generate_anchors(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0. _, _, h, w = feats[i].shape shift_x = torch.arange(end=w, device=device) + grid_cell_offset shift_y = torch.arange(end=h, device=device) + grid_cell_offset - shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') if torch_1_10_plus else torch.meshgrid(shift_y, shift_x) + try: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') + except: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x) anchor_point = torch.stack( [shift_x, shift_y], axis=-1).to(torch.float) if mode == 'af': # anchor-free @@ -37,7 +38,10 @@ def generate_anchors(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0. 
cell_half_size = grid_cell_size * stride * 0.5 shift_x = (torch.arange(end=w, device=device) + grid_cell_offset) * stride shift_y = (torch.arange(end=h, device=device) + grid_cell_offset) * stride - shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') if torch_1_10_plus else torch.meshgrid(shift_y, shift_x) + try: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') + except: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x) anchor = torch.stack( [ shift_x - cell_half_size, shift_y - cell_half_size, diff --git a/yolov6/assigners/atss_assigner.py b/yolov6/assigners/atss_assigner.py index 12a5f243..c1d51e74 100644 --- a/yolov6/assigners/atss_assigner.py +++ b/yolov6/assigners/atss_assigner.py @@ -21,7 +21,8 @@ def forward(self, gt_labels, gt_bboxes, mask_gt, - pd_bboxes): + pd_bboxes, + gt_segmasks): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py @@ -47,7 +48,8 @@ def forward(self, return torch.full( [self.bs, self.n_anchors], self.bg_idx).to(device), \ torch.zeros([self.bs, self.n_anchors, 4]).to(device), \ torch.zeros([self.bs, self.n_anchors, self.num_classes]).to(device), \ - torch.zeros([self.bs, self.n_anchors]).to(device) + torch.zeros([self.bs, self.n_anchors]).to(device), \ + torch.zeros(*pd_bboxes.shape[:2], 40, 40) overlaps = iou2d_calculator(gt_bboxes.reshape([-1, 4]), anc_bboxes) @@ -74,7 +76,7 @@ def forward(self, mask_pos, overlaps, self.n_max_boxes) # assigned target - target_labels, target_bboxes, target_scores = self.get_targets( + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( gt_labels, gt_bboxes, target_gt_idx, fg_mask) # soft label with iou @@ -83,7 +85,7 @@ def forward(self, ious = ious.max(axis=-2)[0].unsqueeze(-1) target_scores *= ious - return target_labels.long(), target_bboxes, target_scores, fg_mask.bool() + return target_labels.long(), target_bboxes, target_scores, fg_mask.bool(), target_segmasks def select_topk_candidates(self, distances, @@ -139,7 +141,8 @@ def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, - fg_mask): + fg_mask, + gt_segmasks): # assigned target labels batch_idx = torch.arange(self.bs, dtype=gt_labels.dtype, device=gt_labels.device) @@ -158,4 +161,7 @@ def get_targets(self, target_scores = F.one_hot(target_labels.long(), self.num_classes + 1).float() target_scores = target_scores[:, :, :self.num_classes] - return target_labels, target_bboxes, target_scores + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx.flatten()] + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/assigners/atss_assigner_seg.py b/yolov6/assigners/atss_assigner_seg.py new file mode 100644 index 00000000..bf844387 --- /dev/null +++ b/yolov6/assigners/atss_assigner_seg.py @@ -0,0 +1,166 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from yolov6.assigners.iou2d_calculator import iou2d_calculator +from yolov6.assigners.assigner_utils import dist_calculator, select_candidates_in_gts, select_highest_overlaps, iou_calculator + +class ATSSAssigner(nn.Module): + '''Adaptive Training Sample Selection Assigner''' + def __init__(self, + topk=9, + num_classes=80): + super(ATSSAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + + @torch.no_grad() + def forward(self, + anc_bboxes, + n_level_bboxes, + gt_labels, + gt_bboxes, + mask_gt, + pd_bboxes, + 
gt_segmasks): + r"""This code is based on + https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py + + Args: + anc_bboxes (Tensor): shape(num_total_anchors, 4) + n_level_bboxes (List):len(3) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + pd_bboxes (Tensor): shape(bs, n_max_boxes, 4) + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + """ + self.n_anchors = anc_bboxes.size(0) + self.bs = gt_bboxes.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return torch.full( [self.bs, self.n_anchors], self.bg_idx).to(device), \ + torch.zeros([self.bs, self.n_anchors, 4]).to(device), \ + torch.zeros([self.bs, self.n_anchors, self.num_classes]).to(device), \ + torch.zeros([self.bs, self.n_anchors]).to(device) + + + overlaps = iou2d_calculator(gt_bboxes.reshape([-1, 4]), anc_bboxes) + overlaps = overlaps.reshape([self.bs, -1, self.n_anchors]) + + distances, ac_points = dist_calculator(gt_bboxes.reshape([-1, 4]), anc_bboxes) + distances = distances.reshape([self.bs, -1, self.n_anchors]) + + is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, n_level_bboxes, mask_gt) + + overlaps_thr_per_gt, iou_candidates = self.thres_calculator( + is_in_candidate, candidate_idxs, overlaps) + + # select candidates iou >= threshold as positive + is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, self.n_anchors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) + + is_in_gts = select_candidates_in_gts(ac_points, gt_bboxes) + mask_pos = is_pos * is_in_gts * mask_gt + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps( + mask_pos, overlaps, self.n_max_boxes) + + # assigned target + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( + gt_labels, gt_bboxes, target_gt_idx, fg_mask, gt_segmasks) + + # soft label with iou + if pd_bboxes is not None: + ious = iou_calculator(gt_bboxes, pd_bboxes) * mask_pos + ious = ious.max(axis=-2)[0].unsqueeze(-1) + target_scores *= ious + + return target_labels.long(), target_bboxes, target_scores, fg_mask.bool(), target_segmasks + + def select_topk_candidates(self, + distances, + n_level_bboxes, + mask_gt): + + mask_gt = mask_gt.repeat(1, 1, self.topk).bool() + level_distances = torch.split(distances, n_level_bboxes, dim=-1) + is_in_candidate_list = [] + candidate_idxs = [] + start_idx = 0 + for per_level_distances, per_level_boxes in zip(level_distances, n_level_bboxes): + + end_idx = start_idx + per_level_boxes + selected_k = min(self.topk, per_level_boxes) + _, per_level_topk_idxs = per_level_distances.topk(selected_k, dim=-1, largest=False) + candidate_idxs.append(per_level_topk_idxs + start_idx) + per_level_topk_idxs = torch.where(mask_gt, + per_level_topk_idxs, torch.zeros_like(per_level_topk_idxs)) + is_in_candidate = F.one_hot(per_level_topk_idxs, per_level_boxes).sum(dim=-2) + is_in_candidate = torch.where(is_in_candidate > 1, + torch.zeros_like(is_in_candidate), is_in_candidate) + is_in_candidate_list.append(is_in_candidate.to(distances.dtype)) + start_idx = end_idx + + is_in_candidate_list = torch.cat(is_in_candidate_list, dim=-1) + candidate_idxs = torch.cat(candidate_idxs, dim=-1) + + return is_in_candidate_list, 
candidate_idxs + + def thres_calculator(self, + is_in_candidate, + candidate_idxs, + overlaps): + + n_bs_max_boxes = self.bs * self.n_max_boxes + _candidate_overlaps = torch.where(is_in_candidate > 0, + overlaps, torch.zeros_like(overlaps)) + candidate_idxs = candidate_idxs.reshape([n_bs_max_boxes, -1]) + assist_idxs = self.n_anchors * torch.arange(n_bs_max_boxes, device=candidate_idxs.device) + assist_idxs = assist_idxs[:,None] + faltten_idxs = candidate_idxs + assist_idxs + candidate_overlaps = _candidate_overlaps.reshape(-1)[faltten_idxs] + candidate_overlaps = candidate_overlaps.reshape([self.bs, self.n_max_boxes, -1]) + + overlaps_mean_per_gt = candidate_overlaps.mean(axis=-1, keepdim=True) + overlaps_std_per_gt = candidate_overlaps.std(axis=-1, keepdim=True) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + return overlaps_thr_per_gt, _candidate_overlaps + + def get_targets(self, + gt_labels, + gt_bboxes, + target_gt_idx, + fg_mask, + gt_segmasks): + + # assigned target labels + batch_idx = torch.arange(self.bs, dtype=gt_labels.dtype, device=gt_labels.device) + batch_idx = batch_idx[...,None] + target_gt_idx = (target_gt_idx + batch_idx * self.n_max_boxes).long() + target_labels = gt_labels.flatten()[target_gt_idx.flatten()] + target_labels = target_labels.reshape([self.bs, self.n_anchors]) + target_labels = torch.where(fg_mask > 0, + target_labels, torch.full_like(target_labels, self.bg_idx)) + + # assigned target boxes + target_bboxes = gt_bboxes.reshape([-1, 4])[target_gt_idx.flatten()] + target_bboxes = target_bboxes.reshape([self.bs, self.n_anchors, 4]) + + # assigned target scores + target_scores = F.one_hot(target_labels.long(), self.num_classes + 1).float() + target_scores = target_scores[:, :, :self.num_classes] + + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx.flatten()] + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/assigners/tal_assigner.py b/yolov6/assigners/tal_assigner.py index 45008f5a..d1bd404a 100644 --- a/yolov6/assigners/tal_assigner.py +++ b/yolov6/assigners/tal_assigner.py @@ -25,7 +25,8 @@ def forward(self, anc_points, gt_labels, gt_bboxes, - mask_gt): + mask_gt, + gt_segmasks): """This code referenced to https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py @@ -50,10 +51,11 @@ def forward(self, return torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), \ torch.zeros_like(pd_bboxes).to(device), \ torch.zeros_like(pd_scores).to(device), \ - torch.zeros_like(pd_scores[..., 0]).to(device) + torch.zeros_like(pd_scores[..., 0]).to(device), \ + torch.zeros(*pd_bboxes.shape[:2], 40, 40) cycle, step, self.bs = (1, self.bs, self.bs) if self.n_max_boxes <= 100 else (self.bs, 1, 1) - target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst = [], [], [], [] + target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst, target_segmasks_lst = [], [], [], [], [] # loop batch dim in case of numerous object box for i in range(cycle): start, end = i*step, (i+1)*step @@ -62,6 +64,7 @@ def forward(self, gt_labels_ = gt_labels[start:end, ...] gt_bboxes_ = gt_bboxes[start:end, ...] mask_gt_ = mask_gt[start:end, ...] + gt_segmasks_ = gt_segmasks[start:end, ...] 
mask_pos, align_metric, overlaps = self.get_pos_mask( pd_scores_, pd_bboxes_, gt_labels_, gt_bboxes_, anc_points, mask_gt_) @@ -70,8 +73,8 @@ def forward(self, mask_pos, overlaps, self.n_max_boxes) # assigned target - target_labels, target_bboxes, target_scores = self.get_targets( - gt_labels_, gt_bboxes_, target_gt_idx, fg_mask) + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( + gt_labels_, gt_bboxes_, target_gt_idx, fg_mask, gt_segmasks_) # normalize align_metric *= mask_pos @@ -85,14 +88,16 @@ def forward(self, target_bboxes_lst.append(target_bboxes) target_scores_lst.append(target_scores) fg_mask_lst.append(fg_mask) + target_segmasks_lst.append(target_segmasks) # concat target_labels = torch.cat(target_labels_lst, 0) target_bboxes = torch.cat(target_bboxes_lst, 0) target_scores = torch.cat(target_scores_lst, 0) fg_mask = torch.cat(fg_mask_lst, 0) + target_segmasks = torch.cat(target_segmasks_lst, 0) - return target_labels, target_bboxes, target_scores, fg_mask.bool() + return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_segmasks def get_pos_mask(self, pd_scores, @@ -153,7 +158,8 @@ def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, - fg_mask): + fg_mask, + gt_segmasks): # assigned target labels batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[...,None] @@ -169,5 +175,8 @@ def get_targets(self, fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) target_scores = torch.where(fg_scores_mask > 0, target_scores, torch.full_like(target_scores, 0)) + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx] - return target_labels, target_bboxes, target_scores + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/assigners/tal_assigner_seg.py b/yolov6/assigners/tal_assigner_seg.py new file mode 100644 index 00000000..057c718b --- /dev/null +++ b/yolov6/assigners/tal_assigner_seg.py @@ -0,0 +1,185 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from yolov6.assigners.assigner_utils import select_candidates_in_gts, select_highest_overlaps, iou_calculator, dist_calculator + +class TaskAlignedAssigner(nn.Module): + def __init__(self, + topk=13, + num_classes=80, + alpha=1.0, + beta=6.0, + eps=1e-9): + super(TaskAlignedAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, + pd_scores, + pd_bboxes, + anc_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks): + """This code referenced to + https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py + + Args: + pd_scores (Tensor): shape(bs, num_total_anchors, num_classes) + pd_bboxes (Tensor): shape(bs, num_total_anchors, 4) + anc_points (Tensor): shape(num_total_anchors, 2) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + """ + self.bs = pd_scores.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return torch.full_like(pd_scores[..., 0], 
self.bg_idx).to(device), \ + torch.zeros_like(pd_bboxes).to(device), \ + torch.zeros_like(pd_scores).to(device), \ + torch.zeros_like(pd_scores[..., 0]).to(device), \ + [] + #torch.zeros(*pd_bboxes.shape[:2]).to(device) + + + # cycle, step, self.bs = (1, self.bs, self.bs) if self.n_max_boxes <= 100 else (self.bs, 1, 1) + cycle, step, self.bs = (1, self.bs, self.bs) + target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst, idx_lst = [], [], [], [], [] + # loop batch dim in case of numerous object box + for i in range(cycle): + start, end = i*step, (i+1)*step + pd_scores_ = pd_scores[start:end, ...] + pd_bboxes_ = pd_bboxes[start:end, ...] + gt_labels_ = gt_labels[start:end, ...] + gt_bboxes_ = gt_bboxes[start:end, ...] + mask_gt_ = mask_gt[start:end, ...] + # gt_segmasks_ = gt_segmasks[start:end, ...] + + mask_pos, align_metric, overlaps = self.get_pos_mask( + pd_scores_, pd_bboxes_, gt_labels_, gt_bboxes_, anc_points, mask_gt_) + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps( + mask_pos, overlaps, self.n_max_boxes) + + # assigned target + target_labels, target_bboxes, target_scores, idx = self.get_targets( + gt_labels_, gt_bboxes_, target_gt_idx, fg_mask) + + # normalize + align_metric *= mask_pos + pos_align_metrics = align_metric.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * mask_pos).max(axis=-1, keepdim=True)[0] + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + # append + target_labels_lst.append(target_labels) + idx_lst.append(idx) + target_bboxes_lst.append(target_bboxes) + target_scores_lst.append(target_scores) + fg_mask_lst.append(fg_mask) + # target_segmasks_lst.append(target_segmasks) + + # concat + target_labels = torch.cat(target_labels_lst, 0) + target_bboxes = torch.cat(target_bboxes_lst, 0) + target_scores = torch.cat(target_scores_lst, 0) + fg_mask = torch.cat(fg_mask_lst, 0) + # target_segmasks = torch.cat(target_segmasks_lst, 0) + + return target_labels, target_bboxes, target_scores, fg_mask.bool(), idx_lst + + def get_pos_mask(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes, + anc_points, + mask_gt): + + # get anchor_align metric + align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes) + # get in_gts mask + mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes) + # get topk_metric mask + mask_topk = self.select_topk_candidates( + align_metric * mask_in_gts, topk_mask=mask_gt.repeat([1, 1, self.topk]).bool()) + # merge all mask to a final mask + mask_pos = mask_topk * mask_in_gts * mask_gt + + return mask_pos, align_metric, overlaps + + def get_box_metrics(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes): + + pd_scores = pd_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) + ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes) + ind[1] = gt_labels.squeeze(-1) + bbox_scores = pd_scores[ind[0], ind[1]] + + overlaps = iou_calculator(gt_bboxes, pd_bboxes) + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + + return align_metric, overlaps + + def select_topk_candidates(self, + metrics, + largest=True, + topk_mask=None): + + num_anchors = metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + metrics, self.topk, axis=-1, largest=largest) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > self.eps).tile( + 
[1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, + torch.zeros_like(is_in_topk), is_in_topk) + return is_in_topk.to(metrics.dtype) + + def get_targets(self, + gt_labels, + gt_bboxes, + target_gt_idx, + fg_mask): + + # assigned target labels + batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[...,None] + target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes + target_labels = gt_labels.long().flatten()[target_gt_idx] + + # assigned target boxes + target_bboxes = gt_bboxes.reshape([-1, 4])[target_gt_idx] + + # assigned target scores + target_labels[target_labels<0] = 0 + target_scores = F.one_hot(target_labels, self.num_classes) + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) + target_scores = torch.where(fg_scores_mask > 0, target_scores, + torch.full_like(target_scores, 0)) + # m_shape = gt_segmasks.shape[-2:] + # target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx] + + + return target_labels, target_bboxes, target_scores, target_gt_idx diff --git a/yolov6/assigners/tal_assigner_seg2.py b/yolov6/assigners/tal_assigner_seg2.py new file mode 100644 index 00000000..aa1101cd --- /dev/null +++ b/yolov6/assigners/tal_assigner_seg2.py @@ -0,0 +1,183 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from yolov6.assigners.assigner_utils import select_candidates_in_gts, select_highest_overlaps, iou_calculator, dist_calculator + +class TaskAlignedAssigner(nn.Module): + def __init__(self, + topk=13, + num_classes=80, + alpha=1.0, + beta=6.0, + eps=1e-9): + super(TaskAlignedAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, + pd_scores, + pd_bboxes, + anc_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks): + """This code referenced to + https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py + + Args: + pd_scores (Tensor): shape(bs, num_total_anchors, num_classes) + pd_bboxes (Tensor): shape(bs, num_total_anchors, 4) + anc_points (Tensor): shape(num_total_anchors, 2) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + """ + self.bs = pd_scores.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), \ + torch.zeros_like(pd_bboxes).to(device), \ + torch.zeros_like(pd_scores).to(device), \ + torch.zeros_like(pd_scores[..., 0]).to(device), \ + torch.zeros(*pd_bboxes.shape[:2], 40, 40) + + cycle, step, self.bs = (1, self.bs, self.bs) if self.n_max_boxes <= 100 else (self.bs, 1, 1) + target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst, target_segmasks_lst = [], [], [], [], [] + # loop batch dim in case of numerous object box + for i in range(cycle): + start, end = i*step, (i+1)*step + pd_scores_ = pd_scores[start:end, ...] + pd_bboxes_ = pd_bboxes[start:end, ...] 
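select_topk_candidates above turns the per-GT alignment metric into an anchor mask by scattering the top-k anchor indices into a one-hot map and zeroing anchors that were selected more than once. A small sketch with assumed shapes:

```python
# Sketch of the top-k candidate mask construction (shapes are illustrative, not the patch itself).
import torch
import torch.nn.functional as F

topk, eps = 3, 1e-9
metrics = torch.rand(2, 4, 10)                  # (bs, n_max_boxes, n_anchors) alignment metric
topk_metrics, topk_idxs = torch.topk(metrics, topk, dim=-1, largest=True)

# keep only GTs whose best metric exceeds eps
topk_mask = (topk_metrics.max(dim=-1, keepdim=True)[0] > eps).tile([1, 1, topk])
topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs))

# scatter the k indices into a one-hot anchor mask; drop anchors picked twice
is_in_topk = F.one_hot(topk_idxs, metrics.shape[-1]).sum(dim=-2)   # (bs, n_max_boxes, n_anchors)
is_in_topk = torch.where(is_in_topk > 1, torch.zeros_like(is_in_topk), is_in_topk)
print(is_in_topk.shape)                          # torch.Size([2, 4, 10])
```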
+ gt_labels_ = gt_labels[start:end, ...] + gt_bboxes_ = gt_bboxes[start:end, ...] + mask_gt_ = mask_gt[start:end, ...] + gt_segmasks_ = gt_segmasks[start:end, ...] + + mask_pos, align_metric, overlaps = self.get_pos_mask( + pd_scores_, pd_bboxes_, gt_labels_, gt_bboxes_, anc_points, mask_gt_) + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps( + mask_pos, overlaps, self.n_max_boxes) + + # assigned target + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( + gt_labels_, gt_bboxes_, target_gt_idx, fg_mask, gt_segmasks_) + + # normalize + align_metric *= mask_pos + pos_align_metrics = align_metric.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * mask_pos).max(axis=-1, keepdim=True)[0] + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + # append + target_labels_lst.append(target_labels) + target_bboxes_lst.append(target_bboxes) + target_scores_lst.append(target_scores) + fg_mask_lst.append(fg_mask) + target_segmasks_lst.append(target_segmasks) + + # concat + target_labels = torch.cat(target_labels_lst, 0) + target_bboxes = torch.cat(target_bboxes_lst, 0) + target_scores = torch.cat(target_scores_lst, 0) + fg_mask = torch.cat(fg_mask_lst, 0) + target_segmasks = torch.cat(target_segmasks_lst, 0) + + return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_segmasks + + def get_pos_mask(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes, + anc_points, + mask_gt): + + # get anchor_align metric + align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes) + # get in_gts mask + mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes) + # get topk_metric mask + mask_topk = self.select_topk_candidates( + align_metric * mask_in_gts, topk_mask=mask_gt.repeat([1, 1, self.topk]).bool()) + # merge all mask to a final mask + mask_pos = mask_topk * mask_in_gts * mask_gt + + return mask_pos, align_metric, overlaps + + def get_box_metrics(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes): + + pd_scores = pd_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) + ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes) + ind[1] = gt_labels.squeeze(-1) + bbox_scores = pd_scores[ind[0], ind[1]] + + overlaps = iou_calculator(gt_bboxes, pd_bboxes) + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + + return align_metric, overlaps + + def select_topk_candidates(self, + metrics, + largest=True, + topk_mask=None): + + num_anchors = metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + metrics, self.topk, axis=-1, largest=largest) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > self.eps).tile( + [1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, + torch.zeros_like(is_in_topk), is_in_topk) + return is_in_topk.to(metrics.dtype) + + def get_targets(self, + gt_labels, + gt_bboxes, + target_gt_idx, + fg_mask, + gt_segmasks): + + # assigned target labels + batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[...,None] + target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes + target_labels = gt_labels.long().flatten()[target_gt_idx] + + # assigned target boxes 
+ target_bboxes = gt_bboxes.reshape([-1, 4])[target_gt_idx] + + # assigned target scores + target_labels[target_labels<0] = 0 + target_scores = F.one_hot(target_labels, self.num_classes) + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) + target_scores = torch.where(fg_scores_mask > 0, target_scores, + torch.full_like(target_scores, 0)) + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx] + print(target_gt_idx.shape, fg_mask.shape) + + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/core/engine.py b/yolov6/core/engine.py index 10545135..663a0812 100644 --- a/yolov6/core/engine.py +++ b/yolov6/core/engine.py @@ -21,7 +21,6 @@ from yolov6.models.yolo import build_model from yolov6.models.yolo_lite import build_model as build_lite_model -from yolov6.models.losses.loss import ComputeLoss as ComputeLoss from yolov6.models.losses.loss_fuseab import ComputeLoss as ComputeLoss_ab from yolov6.models.losses.loss_distill import ComputeLoss as ComputeLoss_distill from yolov6.models.losses.loss_distill_ns import ComputeLoss as ComputeLoss_distill_ns @@ -35,6 +34,8 @@ from yolov6.utils.general import download_ckpt + + class Trainer: def __init__(self, args, cfg, device): self.args = args @@ -42,6 +43,8 @@ def __init__(self, args, cfg, device): self.device = device self.max_epoch = args.epochs + + if args.resume: self.ckpt = torch.load(args.resume, map_location='cpu') @@ -105,8 +108,8 @@ def __init__(self, args, cfg, device): self.height = args.height self.width = args.width - self.loss_num = 3 - self.loss_info = ['Epoch', 'lr', 'iou_loss', 'dfl_loss', 'cls_loss'] + self.loss_num = 4 + self.loss_info = ['Epoch', 'lr', 'iou_loss', 'dfl_loss', 'cls_loss', "seg_loss"] if self.args.distill: self.loss_num += 1 self.loss_info += ['cwd_loss'] @@ -140,7 +143,9 @@ def train_one_epoch(self, epoch_num): # Training one batch data. 
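In get_targets above, the assigned labels are one-hot encoded into per-anchor class-score targets and then zeroed for background anchors via fg_mask. A standalone sketch of that step, with assumed shapes:

```python
# Sketch of one-hot class-score targets masked to foreground anchors (assumed shapes).
import torch
import torch.nn.functional as F

bs, n_anchors, num_classes = 2, 5, 80
target_labels = torch.randint(0, num_classes, (bs, n_anchors))
fg_mask = torch.randint(0, 2, (bs, n_anchors))                   # 1 = anchor matched to a GT

target_scores = F.one_hot(target_labels, num_classes).float()    # (bs, n_anchors, num_classes)
fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, num_classes)
target_scores = torch.where(fg_scores_mask > 0, target_scores,
                            torch.zeros_like(target_scores))
print(target_scores.shape)                                       # torch.Size([2, 5, 80])
```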
def train_in_steps(self, epoch_num, step_num): - images, targets = self.prepro_data(self.batch_data, self.device) + # torch.cuda.synchronize() + # qq1 = time.time() + images, targets, segmasks = self.prepro_data(self.batch_data, self.device) # plot train_batch and save to tensorboard once an epoch if self.write_trainbatch_tb and self.main_process and self.step == 0: self.plot_train_batch(images, targets) @@ -149,7 +154,11 @@ def train_in_steps(self, epoch_num, step_num): # forward with amp.autocast(enabled=self.device != 'cpu'): _, _, batch_height, batch_width = images.shape + # torch.cuda.synchronize() + # qq2 = time.time() preds, s_featmaps = self.model(images) + # torch.cuda.synchronize() + # qq3 = time.time() if self.args.distill: with torch.no_grad(): t_preds, t_featmaps = self.teacher_model(images) @@ -159,18 +168,21 @@ def train_in_steps(self, epoch_num, step_num): batch_height, batch_width) elif self.args.fuse_ab: - total_loss, loss_items = self.compute_loss((preds[0],preds[3],preds[4]), targets, epoch_num, - step_num, batch_height, batch_width) # YOLOv6_af - total_loss_ab, loss_items_ab = self.compute_loss_ab(preds[:3], targets, epoch_num, step_num, - batch_height, batch_width) # YOLOv6_ab + total_loss, loss_items = self.compute_loss((preds[0],preds[3],preds[4], preds[5]), targets, epoch_num, + step_num, batch_height, batch_width, segmasks) # YOLOv6_af + total_loss_ab, loss_items_ab = self.compute_loss_ab((preds[0],preds[1],preds[2], preds[6]), targets, epoch_num, step_num, + batch_height, batch_width, segmasks) # YOLOv6_ab total_loss += total_loss_ab loss_items += loss_items_ab else: - total_loss, loss_items = self.compute_loss(preds, targets, epoch_num, step_num, - batch_height, batch_width) # YOLOv6_af + total_loss, loss_items = self.compute_loss((preds[0],preds[3],preds[4], preds[5]), targets, epoch_num, step_num, + batch_height, batch_width, segmasks, img=images) # YOLOv6_af if self.rank != -1: total_loss *= self.world_size + # torch.cuda.synchronize() + # qq4 = time.time() # backward + # print("prepare : {}s | model : {}s | loss : {}s".format(qq2 - qq1, qq3 - qq2, qq4 - qq3)) self.scaler.scale(total_loss).backward() self.loss_items = loss_items self.update_optimizer() @@ -186,12 +198,12 @@ def after_epoch(self): is_val_epoch = (remaining_epochs == 0) or ((not self.args.eval_final_only) and ((self.epoch + 1) % eval_interval == 0)) if is_val_epoch: self.eval_model() - self.ap = self.evaluate_results[1] + self.ap = self.evaluate_results[3] self.best_ap = max(self.ap, self.best_ap) # save ckpt ckpt = { - 'model': deepcopy(de_parallel(self.model)).half(), - 'ema': deepcopy(self.ema.ema).half(), + 'model': deepcopy(de_parallel(self.model)), + 'ema': deepcopy(self.ema.ema), 'updates': self.ema.updates, 'optimizer': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict(), @@ -231,7 +243,10 @@ def eval_model(self): task='train', specific_shape=self.specific_shape, height=self.height, - width=self.width + width=self.width, + do_pr_metric=True, + do_coco_metric=False, + issolo=self.cfg.model.head.issolo ) else: def get_cfg_value(cfg_dict, value_str, default_value): @@ -263,10 +278,10 @@ def get_cfg_value(cfg_dict, value_str, default_value): width=self.width ) - LOGGER.info(f"Epoch: {self.epoch} | mAP@0.5: {results[0]} | mAP@0.50:0.95: {results[1]}") - self.evaluate_results = results[:2] + LOGGER.info(f"Epoch: {self.epoch} | box_mAP@0.5: {results[0]} | box_mAP@0.50:0.95: {results[1]} | mask_mAP@0.5: {results[2]} | mask_mAP@0.50:0.95: {results[3]}") + self.evaluate_results 
= [results[1], results[3]] # plot validation predictions - self.plot_val_pred(vis_outputs, vis_paths) + # self.plot_val_pred(vis_outputs, vis_paths) def before_train_loop(self): @@ -286,6 +301,10 @@ def before_train_loop(self): self.best_ap = self.evaluate_results[1] self.best_stop_strong_aug_ap = self.evaluate_results[1] + if self.cfg.model.head.issolo: + from yolov6.models.losses.seg_loss_solo_main import ComputeLoss as ComputeLoss + else: + from yolov6.models.losses.seg_loss import ComputeLoss as ComputeLoss self.compute_loss = ComputeLoss(num_classes=self.data_dict['nc'], ori_img_size=self.img_size, @@ -293,6 +312,7 @@ def before_train_loop(self): use_dfl=self.cfg.model.head.use_dfl, reg_max=self.cfg.model.head.reg_max, iou_type=self.cfg.model.head.iou_type, + nm=self.cfg.model.head.nm, fpn_strides=self.cfg.model.head.strides) if self.args.fuse_ab: @@ -305,7 +325,7 @@ def before_train_loop(self): fpn_strides=self.cfg.model.head.strides, ) if self.args.distill : - if self.cfg.model.type in ['YOLOv6n','YOLOv6s']: + if self.cfg.model.type in ['YOLOv6n','YOLOv6s']: Loss_distill_func = ComputeLoss_distill_ns else: Loss_distill_func = ComputeLoss_distill @@ -404,7 +424,8 @@ def get_data_loader(args, cfg, data_dict): def prepro_data(batch_data, device): images = batch_data[0].to(device, non_blocking=True).float() / 255 targets = batch_data[1].to(device) - return images, targets + segmask = batch_data[4].to(device) + return images, targets, segmask def get_model(self, args, cfg, nc, device): if 'YOLOv6-lite' in cfg.model.type: @@ -588,4 +609,4 @@ def quant_setup(self, model, cfg, device): # QAT flow load calibrated model assert cfg.qat.calib_pt is not None, 'Please provide calibrated model' model.load_state_dict(torch.load(cfg.qat.calib_pt)['model'].float().state_dict()) - model.to(device) + model.to(device) \ No newline at end of file diff --git a/yolov6/core/evaler.py b/yolov6/core/evaler.py index e79f51be..15c8bc76 100644 --- a/yolov6/core/evaler.py +++ b/yolov6/core/evaler.py @@ -7,13 +7,19 @@ import torch import yaml from pathlib import Path +import cv2 +from multiprocessing.pool import ThreadPool + + from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval +import torch.nn.functional as F + from yolov6.data.data_load import create_dataloader from yolov6.utils.events import LOGGER, NCOLS -from yolov6.utils.nms import non_max_suppression +from yolov6.utils.nms import non_max_suppression_seg, non_max_suppression_seg_solo from yolov6.utils.general import download_ckpt from yolov6.utils.checkpoint import load_checkpoint from yolov6.utils.torch_utils import time_sync, get_model_info @@ -87,24 +93,25 @@ def init_data(self, dataloader, task): self.is_coco = self.data.get("is_coco", False) self.ids = self.coco80_to_coco91_class() if self.is_coco else list(range(1000)) if task != 'train': + pad = 0.0 eval_hyp = { "shrink_size":self.shrink_size, } rect = self.infer_on_rect - pad = 0.5 if rect else 0.0 dataloader = create_dataloader(self.data[task if task in ('train', 'val', 'test') else 'val'], - self.img_size, self.batch_size, self.stride, hyp=eval_hyp, check_labels=True, pad=pad, rect=rect, + self.img_size, self.batch_size, self.stride, hyp=eval_hyp, check_labels=True, pad=0.5, rect=True, data_dict=self.data, task=task, specific_shape=self.specific_shape, height=self.height, width=self.width)[0] return dataloader - def predict_model(self, model, dataloader, task): + def predict_model(self, model, dataloader, task, issolo=False, weight_nums=66, bias_nums=1, dyconv_channels=66): 
'''Model prediction Predicts the whole dataset and gets the prediced results and inference time. ''' self.speed_result = torch.zeros(4, device=self.device) pred_results = [] pbar = tqdm(dataloader, desc=f"Inferencing model in {task} datasets.", ncols=NCOLS) - + weight_nums = [weight_nums] + bias_nums = [bias_nums] # whether to compute metric and plot PR curve and P、R、F1 curve under iou50 match rule if self.do_pr_metric: stats, ap = [], [] @@ -115,7 +122,7 @@ def predict_model(self, model, dataloader, task): from yolov6.utils.metrics import ConfusionMatrix confusion_matrix = ConfusionMatrix(nc=model.nc) - for i, (imgs, targets, paths, shapes) in enumerate(pbar): + for i, (imgs, targets, paths, shapes, masks) in enumerate(pbar): # pre-process t1 = time_sync() imgs = imgs.to(self.device, non_blocking=True) @@ -125,12 +132,23 @@ def predict_model(self, model, dataloader, task): # Inference t2 = time_sync() - outputs, _ = model(imgs) + toutputs, _ = model(imgs) self.speed_result[2] += time_sync() - t2 # inference time # post-process t3 = time_sync() - outputs = non_max_suppression(outputs, self.conf_thres, self.iou_thres, multi_label=True) + if not issolo: + loutputs = non_max_suppression_seg(toutputs, self.conf_thres, self.iou_thres, multi_label=True) + else: + loutputs = non_max_suppression_seg_solo(toutputs, self.conf_thres, self.iou_thres, multi_label=True) + protos = toutputs[1][0] + segments = [] + segconf = [loutputs[li][..., 0:] for li in range(len(loutputs))] + outputs = [loutputs[li][..., :6] for li in range(len(loutputs))] + if not issolo: + segments = [self.handle_proto_test([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], imgs.shape[-2:]) for li in range(len(loutputs))] + else: + segments = [self.handle_proto_solo([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], imgs.shape[-2:], weight_sums=weight_nums, bias_sums=bias_nums, dyconv=dyconv_channels) for li in range(len(loutputs))] self.speed_result[3] += time_sync() - t3 # post-process time self.speed_result[0] += len(outputs) @@ -139,7 +157,7 @@ def predict_model(self, model, dataloader, task): eval_outputs = copy.deepcopy([x.detach().cpu() for x in outputs]) # save result - pred_results.extend(self.convert_to_coco_format(outputs, imgs, paths, shapes, self.ids)) + # pred_results.extend(self.convert_to_coco_format_seg(outputs, imgs, paths, shapes, self.ids, segments)) # for tensorboard visualization, maximum images to show: 8 if i == 0: @@ -153,25 +171,29 @@ def predict_model(self, model, dataloader, task): # Statistics per image # This code is based on # https://github.com/ultralytics/yolov5/blob/master/val.py - for si, pred in enumerate(eval_outputs): + for si, (pred, pred_masks) in enumerate(zip(eval_outputs, segments)): labels = targets[targets[:, 0] == si, 1:] nl = len(labels) tcls = labels[:, 0].tolist() if nl else [] # target class seen += 1 + correct_masks = torch.zeros(len(pred), niou, dtype=torch.bool) # init + correct = torch.zeros(len(pred), niou, dtype=torch.bool) # init if len(pred) == 0: if nl: - stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls)) + stats.append((correct_masks, correct, torch.Tensor(), torch.Tensor(), tcls)) continue + # Masks + midx = targets[:, 0] == si + gt_masks = masks[midx] # Predictions predn = pred.clone() self.scale_coords(imgs[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1]) # native-space pred # Assign all predictions as incorrect - correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool) + if nl: - from 
yolov6.utils.nms import xywh2xyxy # target boxes @@ -183,49 +205,122 @@ def predict_model(self, model, dataloader, task): labelsn = torch.cat((labels[:, 0:1], tbox), 1) # native-space labels - from yolov6.utils.metrics import process_batch + from yolov6.utils.metrics import process_batch correct = process_batch(predn, labelsn, iouv) + correct_masks = process_batch(predn, labelsn, iouv, pred_masks, gt_masks, overlap=False, masks=True) if self.plot_confusion_matrix: confusion_matrix.process_batch(predn, labelsn) # Append statistics (correct, conf, pcls, tcls) - stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) + + + stats.append((correct_masks.cpu(), correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) if self.do_pr_metric: # Compute statistics stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy if len(stats) and stats[0].any(): - - from yolov6.utils.metrics import ap_per_class - p, r, ap, f1, ap_class = ap_per_class(*stats, plot=self.plot_curve, save_dir=self.save_dir, names=model.names) - AP50_F1_max_idx = len(f1.mean(0)) - f1.mean(0)[::-1].argmax() -1 - LOGGER.info(f"IOU 50 best mF1 thershold near {AP50_F1_max_idx/1000.0}.") - ap50, ap = ap[:, 0], ap.mean(1) # AP@0.5, AP@0.5:0.95 - mp, mr, map50, map = p[:, AP50_F1_max_idx].mean(), r[:, AP50_F1_max_idx].mean(), ap50.mean(), ap.mean() - nt = np.bincount(stats[3].astype(np.int64), minlength=model.nc) # number of targets per class + from yolov6.utils.metrics import ap_per_class_box_and_mask, Metrics + metrics = Metrics() + # v5 method + results = ap_per_class_box_and_mask(*stats, plot=self.plot_curve, save_dir=self.save_dir, names=model.names) + metrics.update(results) + nt = np.bincount(stats[4].astype(np.int64), minlength=model.nc) # number of targets per class # Print results - s = ('%-16s' + '%12s' * 7) % ('Class', 'Images', 'Labels', 'P@.5iou', 'R@.5iou', 'F1@.5iou', 'mAP@.5', 'mAP@.5:.95') + s = ('%22s' + '%15s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', 'R', + 'mAP50', 'mAP50-95)') LOGGER.info(s) - pf = '%-16s' + '%12i' * 2 + '%12.3g' * 5 # print format - LOGGER.info(pf % ('all', seen, nt.sum(), mp, mr, f1.mean(0)[AP50_F1_max_idx], map50, map)) - - self.pr_metric_result = (map50, map) - - # Print results per class - if self.verbose and model.nc > 1: - for i, c in enumerate(ap_class): - LOGGER.info(pf % (model.names[c], seen, nt[c], p[i, AP50_F1_max_idx], r[i, AP50_F1_max_idx], - f1[i, AP50_F1_max_idx], ap50[i], ap[i])) + pf = '%22s' + '%15i' * 2 + '%11.5g' * 8 # print format + mr = metrics.mean_results() + LOGGER.info(pf % ('all', seen, nt.sum(), *mr)) + return [mr[2], mr[3], mr[6], mr[7]], [], [] if self.plot_confusion_matrix: confusion_matrix.plot(save_dir=self.save_dir, names=list(model.names)) else: - LOGGER.info("Calculate metric failed, might check dataset.") - self.pr_metric_result = (0.0, 0.0) + return [0, 0, 0, 0], [], [] + + return pred_results - return pred_results, vis_outputs, vis_paths + def parse_dynamic_params(self, flatten_kernels, weight_nums, bias_nums, dyconv_channels): + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, weight_nums + bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * dyconv_channels, -1, 1, 1) + bias_splits[i] = 
bias_splits[i].reshape(n_inst * + dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def handle_proto_solo(self, proto_list, oconfs, imgshape, weight_sums=66, bias_sums=1, dyconv=66, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def handle_proto_coord(proto): + _ = proto.shape[-2:] + x = torch.arange(0, 1, step = 1 / _[1]).unsqueeze(0).unsqueeze(0).repeat(1, _[0], 1).to(proto.dtype).to(proto.device) + y = torch.arange(0, 1, step = 1 / _[0]).unsqueeze(0).T.unsqueeze(0).repeat(1, 1, _[1]).to(proto.dtype).to(proto.device) + return torch.cat([proto, x, y]).reshape(1, -1, *_) + + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). + + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0][0] + proto = handle_proto_coord(proto) + s = proto.shape[-2:] + num_inst = confs.shape[0] + proto = proto.reshape(1, -1, *proto.shape[-2:]) + weights, biases = self.parse_dynamic_params(confs, weight_nums=weight_sums, bias_nums=bias_sums, dyconv_channels=dyconv) + n_layers = len(weights) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + proto, weight, bias=bias, stride=1, padding=0, groups=1) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, *proto.shape[-2:]).unsqueeze(0) + seg = x.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + masks = masks.gt_(0.5) + return masks + def eval_model(self, pred_results, model, dataloader, task): @@ -282,7 +377,8 @@ def eval_model(self, pred_results, model, dataloader, task): label_count_dicts[nc_i]["images"].add(ann_i["image_id"]) label_count_dicts[nc_i]["anns"] += 1 - s = ('%-16s' + '%12s' * 7) % ('Class', 'Labeled_images', 'Labels', 'P@.5iou', 'R@.5iou', 'F1@.5iou', 'mAP@.5', 'mAP@.5:.95') + s = ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', 'R', + 'mAP50', 'mAP50-95)') LOGGER.info(s) #IOU , all p, all cats, all gt, maxdet 100 coco_p = cocoEval.eval['precision'] @@ -383,6 +479,51 @@ def convert_to_coco_format(self, outputs, imgs, paths, shapes, ids): pred_results.append(pred_data) return pred_results + def convert_to_coco_format_seg(self, outputs, imgs, paths, shapes, ids, masks): + + from pycocotools.mask import encode + import time + + def single_encode(x): + rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0] + rle['counts'] = rle['counts'].decode('utf-8') + return rle + + + pred_results = [] + for i, pred in enumerate(outputs): + if len(pred) == 0: + continue + pred_masks = masks[i].cpu().numpy() + 
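convert_to_coco_format_seg encodes each predicted binary mask to COCO RLE with pycocotools, parallelized over a thread pool before being attached to the detection records. A minimal standalone sketch (random masks and a 160x160 size are assumptions for the example):

```python
# Sketch of the per-mask RLE encoding used for COCO-style submission (mask size is assumed).
import numpy as np
from multiprocessing.pool import ThreadPool
from pycocotools.mask import encode

def single_encode(x):
    # pycocotools expects a Fortran-ordered uint8 HxWx1 array; encode returns a list of RLE dicts
    rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0]
    rle['counts'] = rle['counts'].decode('utf-8')   # make the RLE JSON-serializable
    return rle

pred_masks = np.random.rand(8, 160, 160) > 0.5      # 8 binary masks (illustrative)
with ThreadPool(4) as pool:
    rles = pool.map(single_encode, list(pred_masks))
print(len(rles), sorted(rles[0].keys()))             # 8 ['counts', 'size']
```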
pred_masks = np.transpose(pred_masks, (2, 0, 1)) + a = time.time() + with ThreadPool(64) as pool: + rles = pool.map(single_encode, pred_masks) + print("rle time") + b = time.time() + path, shape = Path(paths[i]), shapes[i][0] + self.scale_coords(imgs[i].shape[1:], pred[:, :4], shape, shapes[i][1]) + image_id = int(path.stem) if self.is_coco else path.stem + bboxes = self.box_convert(pred[:, 0:4]) + bboxes[:, :2] -= bboxes[:, 2:] / 2 + cls = pred[:, 5] + scores = pred[:, 4] + for ind in range(pred.shape[0]): + category_id = ids[int(cls[ind])] + bbox = [round(x, 3) for x in bboxes[ind].tolist()] + score = round(scores[ind].item(), 5) + pred_data = { + "image_id": image_id, + "category_id": category_id, + "bbox": bbox, + "score": score, + 'segmentation': rles[i] + } + pred_results.append(pred_data) + c = time.time() + print(b-a, c-b) + return pred_results + @staticmethod def check_task(task): if task not in ['train', 'val', 'test', 'speed']: @@ -543,3 +684,48 @@ def convert_to_coco_format_trt(nums, boxes, scores, classes, paths, shapes, ids) pred_results.extend(convert_to_coco_format_trt(nums, boxes, scores, classes, paths, shapes, self.ids)) self.speed_result[0] += self.batch_size return dataloader, pred_results + + + + @staticmethod + def handle_proto_test(proto_list, oconfs, imgshape, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + + + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). + + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0] + + s = proto.shape[-2:] + seg = ((confs@proto.reshape(proto.shape[0], proto.shape[1], -1)).reshape(proto.shape[0], confs.shape[0], *s)) + seg = seg.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + return masks diff --git a/yolov6/core/inferer.py b/yolov6/core/inferer.py index cea6586d..3fef6b35 100644 --- a/yolov6/core/inferer.py +++ b/yolov6/core/inferer.py @@ -13,11 +13,13 @@ from PIL import ImageFont from collections import deque +import torch.nn.functional as F + from yolov6.utils.events import LOGGER, load_yaml from yolov6.layers.common import DetectBackend from yolov6.data.data_augment import letterbox from yolov6.data.datasets import LoadData -from yolov6.utils.nms import non_max_suppression +from yolov6.utils.nms import non_max_suppression_seg, non_max_suppression_seg_solo from yolov6.utils.torch_utils import get_model_info class Inferer: @@ -67,10 +69,13 @@ def model_switch(self, model, img_size): LOGGER.info("Switch model to deploy modality.") - def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, save_img, hide_labels, hide_conf, view_img=True): + def infer(self, conf_thres, 
iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, save_img, hide_labels, hide_conf, view_img=True, issolo=True, weight_nums=66, bias_nums=1, dyconv_channels=66): ''' Model Inference and results visualization ''' vid_path, vid_writer, windows = None, None, [] + print(issolo) fps_calculator = CalcFPS() + weight_nums = [weight_nums] + bias_nums = [bias_nums] for img_src, img_path, vid_cap in tqdm(self.files): img, img_src = self.process_image(img_src, self.img_size, self.stride, self.half) img = img.to(self.device) @@ -79,15 +84,31 @@ def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, # expand for batch dim t1 = time.time() pred_results = self.model(img) - det = non_max_suppression(pred_results, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)[0] + if not issolo: + loutputs = non_max_suppression_seg(pred_results, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det) + else: + loutputs = non_max_suppression_seg_solo(pred_results, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det) + protos = pred_results[1][0] + segments = [] + print(len(loutputs)) + segconf = [loutputs[li][..., 0:] for li in range(len(loutputs))] + det = [loutputs[li][..., :6] for li in range(len(loutputs))][0] + if not issolo: + segments = [self.handle_proto_test([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], img.shape[-2:]) for li in range(len(loutputs))][0] + else: + segments = [self.handle_proto_solo([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], img.shape[-2:], weight_sums=weight_nums, bias_sums=bias_nums, dyconv=dyconv_channels) for li in range(len(loutputs))][0] t2 = time.time() + + if self.webcam: save_path = osp.join(save_dir, self.webcam_addr) txt_path = osp.join(save_dir, self.webcam_addr) else: # Create output files in nested dirs that mirrors the structure of the images' dirs - rel_path = osp.relpath(osp.dirname(img_path), osp.dirname(self.source)) + print(osp.dirname(img_path)) + print(osp.dirname(self.source)) + rel_path = "test" save_path = osp.join(save_dir, rel_path, osp.basename(img_path)) # im.jpg txt_path = osp.join(save_dir, rel_path, 'labels', osp.splitext(osp.basename(img_path))[0]) os.makedirs(osp.join(save_dir, rel_path), exist_ok=True) @@ -98,9 +119,14 @@ def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, # check image and font assert img_ori.data.contiguous, 'Image needs to be contiguous. Please apply to input images with np.ascontiguousarray(im).' 
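The handle_proto_test path called above assembles instance masks YOLACT-style: each detection's mask coefficients are linearly combined with the shared prototype maps, passed through a sigmoid, upsampled to the network input size, and finally cropped to the predicted box. A small sketch of the core combination (sizes are assumptions; the crop_mask step is omitted here):

```python
# Sketch of prototype/coefficient mask assembly (YOLACT-style); sizes are illustrative only.
import torch
import torch.nn.functional as F

n_det, n_proto, ph, pw = 3, 32, 40, 40
protos = torch.rand(1, n_proto, ph, pw)          # shared prototype maps for one image
coeffs = torch.rand(n_det, n_proto)              # one coefficient vector per detection

# linear combination over the prototype dimension, then sigmoid
seg = (coeffs @ protos.reshape(1, n_proto, -1)).reshape(1, n_det, ph, pw).sigmoid()

# upsample to the letterboxed input resolution and binarize
masks = F.interpolate(seg, (320, 320), mode='bilinear', align_corners=False)[0]
masks = masks.gt(0.5)
print(masks.shape)                               # torch.Size([3, 320, 320])
```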
self.font_check() - if len(det): det[:, :4] = self.rescale(img.shape[2:], det[:, :4], img_src.shape).round() + + + ii = 0 + segments = self.rescale_mask(img.shape[2:], segments.cpu().numpy(), img_src.shape) + print(segments.shape) + segments = segments.transpose(2, 0, 1) for *xyxy, conf, cls in reversed(det): if save_txt: # Write to file xywh = (self.box_convert(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh @@ -109,13 +135,16 @@ def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, f.write(('%g ' * len(line)).rstrip() % line + '\n') if save_img: + print(cls) class_num = int(cls) # integer class label = None if hide_labels else (self.class_names[class_num] if hide_conf else f'{self.class_names[class_num]} {conf:.2f}') - self.plot_box_and_label(img_ori, max(round(sum(img_ori.shape) / 2 * 0.003), 2), xyxy, label, color=self.generate_colors(class_num, True)) + img_ori = self.plot_box_and_label(img_ori, max(round(sum(img_ori.shape) / 2 * 0.003), 2), xyxy, label, color=self.generate_colors(class_num, True), segment=segments[ii]) + ii += 1 img_src = np.asarray(img_ori) + # FPS counter fps_calculator.update(1.0 / (t2 - t1)) avg_fps = fps_calculator.accumulate() @@ -187,6 +216,21 @@ def rescale(ori_shape, boxes, target_shape): return boxes + @staticmethod + def rescale_mask(ori_shape, masks, target_shape): + '''Rescale the output to the original image shape''' + ratio = min(ori_shape[0] / target_shape[0], ori_shape[1] / target_shape[1]) + padding = int((ori_shape[1] - target_shape[1] * ratio) / 2), int((ori_shape[0] - target_shape[0] * ratio) / 2) + + + masks = masks[:, padding[1]: ori_shape[0]- padding[1], padding[0]: ori_shape[1] - padding[0]] + masks = masks.transpose(1, 2, 0) + masks = cv2.resize(masks, target_shape[:2][::-1]) + if len(masks.shape) == 2: + masks = masks.reshape(*masks.shape, 1) + + return masks + def check_img_size(self, img_size, s=32, floor=0): """Make sure image size is a multiple of stride s in each dimension, and return a new shape list of image.""" if isinstance(img_size, int): # integer i.e. img_size=640 @@ -204,6 +248,200 @@ def make_divisible(self, x, divisor): # Upward revision the value x to make it evenly divisible by the divisor. return math.ceil(x / divisor) * divisor + @staticmethod + def handle_proto(proto_list, oconfs, imgshape, det): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + + xyxy = oconfs[..., :4] + which_proto = conf[..., 0] + confs = conf[..., 1:] + res = [] + protos = proto_list[0] + for i, proto in enumerate([protos, protos, protos]): + s = proto.shape[-2:] + tconfs = confs[which_proto[..., 0] == i] + if tconfs.shape[0] == 0: + continue + tseg = ((tconfs@proto.reshape(proto.shape[0], proto.shape[1], -1)).reshape(proto.shape[0], tconfs.shape[1], *s)) + print("a:") + print(which_proto[..., 0] == i) + tseg=tseg.sigmoid() + masks = F.interpolate(tseg, imgshape, mode='nearest')[0] + #return masks + print(xyxy[which_proto[..., 0] == i][0].shape) + masks = crop_mask(masks, xyxy[which_proto[..., 0] == i][0])[0] + res.append(masks.gt_(0.5)) + return torch.cat(res, dim = 0), xyxy[which_proto[..., 0] == i][0] + + + @staticmethod + def handle_proto_test(proto_list, oconfs, imgshape, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). + + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0] + s = proto.shape[-2:] + seg = ((confs@proto.reshape(proto.shape[0], proto.shape[1], -1)).reshape(proto.shape[0], confs.shape[0], *s)) + seg = seg.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + return masks + + # def handle_proto_solo(self, proto_list, oconfs, imgshape, weight_sums=66, bias_sums=66, dyconv=66, img_orishape=None): + # ''' + # proto_list: [(bs, 32, w, h), ...] + # conf: (bs, l, 33) -> which_proto, 32 + # ''' + # def crop_mask(masks, boxes): + # """ + # "Crop" predicted masks by zeroing out everything not in the predicted bbox. + # Vectorized by Chong (thanks Chong). 
+ + # Args: + # - masks should be a size [n, h, w] tensor of masks + # - boxes should be a size [n, 4] tensor of bbox coords in relative point form + # """ + + # n, h, w = masks.shape + # x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + # r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + # c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + # return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + # conf = oconfs[..., 6:] + # if conf.shape[0] == 0: + # return None + + # xyxy = oconfs[..., :4] + # confs = conf[..., 1:] + # proto = proto_list[0] + # s = proto.shape[-2:] + # num_inst = confs.shape[0] + # proto = proto.reshape(1, -1, *proto.shape[-2:]) + # proto = proto.repeat(num_inst, 1, 1, 1) + # weights, biases = self.parse_dynamic_params(confs, weight_nums=weight_sums, bias_nums=bias_sums, dyconv_channels=dyconv) + # n_layers = len(weights) + # for i, (weight, bias) in enumerate(zip(weights, biases)): + # x = F.conv2d( + # proto, weight, bias=bias, stride=1, padding=0, groups=num_inst) + # if i < n_layers - 1: + # x = F.relu(x) + # x = x.reshape(num_inst, *proto.shape[-2:]) + # seg = x.sigmoid() + # masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + # if img_orishape: + # masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + # else: + # masks_ori = None + # masks = crop_mask(masks, xyxy).gt_(0.5) + # return masks + def handle_proto_solo(self, proto_list, oconfs, imgshape, weight_sums=66, bias_sums=1, dyconv=66, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def handle_proto_coord(proto): + _ = proto.shape[-2:] + x = torch.arange(0, 1, step = 1 / _[1]).unsqueeze(0).unsqueeze(0).repeat(1, _[0], 1).to(proto.dtype).to(proto.device) + y = torch.arange(0, 1, step = 1 / _[0]).unsqueeze(0).T.unsqueeze(0).repeat(1, 1, _[1]).to(proto.dtype).to(proto.device) + return torch.cat([proto, x, y]).reshape(1, -1, *_) + + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0][0] + proto = handle_proto_coord(proto) + s = proto.shape[-2:] + num_inst = confs.shape[0] + proto = proto.reshape(1, -1, *proto.shape[-2:]) + weights, biases = self.parse_dynamic_params(confs, weight_nums=weight_sums, bias_nums=bias_sums, dyconv_channels=dyconv) + n_layers = len(weights) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + proto, weight, bias=bias, stride=1, padding=0, groups=1) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, *proto.shape[-2:]).unsqueeze(0) + seg = x.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + masks = masks.gt_(0.5) + return masks + + + + + @staticmethod def draw_text( img, @@ -237,9 +475,10 @@ def draw_text( return text_size @staticmethod - def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255), font=cv2.FONT_HERSHEY_COMPLEX): + def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255), font=cv2.FONT_HERSHEY_COMPLEX, segment=None): # Add one xyxy box to image with label p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3])) + common_color = [[128,0,0], [255,0,0],[255,0,255],[255,102,0],[51,51,0],[0,51,0],[51,204,204],[0,128,128],[0,204,255]] cv2.rectangle(image, p1, p2, color, thickness=lw, lineType=cv2.LINE_AA) if label: tf = max(lw - 1, 1) # font thickness @@ -249,6 +488,13 @@ def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_colo cv2.rectangle(image, p1, p2, color, -1, cv2.LINE_AA) # filled cv2.putText(image, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), font, lw / 3, txt_color, thickness=tf, lineType=cv2.LINE_AA) + if segment is not None: + import random + ii=random.randint(0, len(common_color)-1) + colr = np.asarray(common_color[ii]) + colr = colr.reshape(1,3).repeat((image.shape[0] * image.shape[1]), axis = 0).reshape(image.shape[0], image.shape[1], 3) + image = cv2.addWeighted(image, 1, (colr * segment.reshape(*segment.shape[:2], 1)).astype(image.dtype), 0.8, 1) + return image @staticmethod def font_check(font='./yolov6/utils/Arial.ttf', size=10): @@ -280,6 +526,27 @@ def generate_colors(i, bgr=False): num = len(palette) color = palette[int(i) % num] return (color[2], color[1], color[0]) if bgr else color + + def parse_dynamic_params(self, flatten_kernels, weight_nums, bias_nums, dyconv_channels): + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, weight_nums + bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: 
+ weight_splits[i] = weight_splits[i].reshape( + n_inst * dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst * + dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits class CalcFPS: def __init__(self, nsamples: int = 50): diff --git a/yolov6/data/data_augment.py b/yolov6/data/data_augment.py index 45df88e6..e21c3873 100644 --- a/yolov6/data/data_augment.py +++ b/yolov6/data/data_augment.py @@ -26,7 +26,7 @@ def augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5): cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=im) # no return needed -def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32): +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleup=True, stride=32): '''Resize and pad image while meeting stride-multiple constraints.''' shape = im.shape[:2] # current shape [height, width] if isinstance(new_shape, int): @@ -51,19 +51,22 @@ def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleu if shape[::-1] != new_unpad: # resize im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return im, r, (left, top) -def mixup(im, labels, im2, labels2): +def mixup(im, labels, segments, im2, labels2, segments2): '''Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf.''' r = np.random.beta(32.0, 32.0) # mixup ratio, alpha=beta=32.0 im = (im * r + im2 * (1 - r)).astype(np.uint8) labels = np.concatenate((labels, labels2), 0) - return im, labels + segments = np.concatenate((segments, segments2), 0) + return im, labels, segments def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1, eps=1e-16): # box1(4,n), box2(4,n) @@ -78,19 +81,17 @@ def random_affine(img, labels=(), degrees=10, translate=.1, scale=.1, shear=10, new_shape=(640, 640)): '''Applies Random affine transformation.''' n = len(labels) - if isinstance(new_shape, int): - height = width = new_shape - else: - height, width = new_shape + height, width = new_shape M, s = get_transform_matrix(img.shape[:2], (height, width), degrees, scale, shear, translate) if (M != np.eye(3)).any(): # image changed img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) # Transform label coordinates + new_segments = [] if n: new = np.zeros((n, 4)) - + xy = np.ones((n * 4, 3)) xy[:, :2] = labels[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 xy = xy @ M.T # transform @@ -113,6 +114,7 @@ def random_affine(img, labels=(), degrees=10, translate=.1, scale=.1, shear=10, return img, labels + def get_transform_matrix(img_shape, new_shape, degrees, scale, shear, translate): new_height, new_width = new_shape # Center @@ -147,6 +149,7 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False '''Applies Mosaic augmentation.''' assert len(imgs) == 4, "Mosaic augmentation of current version only supports 4 images." 
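The parse_dynamic_params helper added above splits each instance's flattened kernel prediction into convolution weights and biases, which handle_proto_solo then applies to the prototype features as a per-instance dynamic convolution (CondInst-style). A single-layer sketch with sizes matching the weight_nums=66 / bias_nums=1 defaults (other shapes are assumptions):

```python
# Sketch of the dynamic-conv mask head used by the SOLO-style path (channel sizes assumed).
import torch
import torch.nn.functional as F

n_inst, c_feat, h, w = 4, 66, 40, 40
proto = torch.rand(1, c_feat, h, w)               # prototype features (incl. coord channels)
flat_kernels = torch.rand(n_inst, c_feat + 1)     # per instance: 66 weights + 1 bias

# split each flattened prediction into conv weight and bias (single dynamic layer)
weight, bias = torch.split_with_sizes(flat_kernels, [c_feat, 1], dim=1)
weight = weight.reshape(n_inst, c_feat, 1, 1)     # one 1x1 kernel per instance
bias = bias.reshape(n_inst)

masks = F.conv2d(proto, weight, bias=bias)        # (1, n_inst, h, w): one mask per instance
masks = masks.sigmoid().gt(0.5)[0]
print(masks.shape)                                # torch.Size([4, 40, 40])
```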
labels4 = [] + if not specific_shape: if isinstance(shape, list) or isinstance(shape, np.ndarray): target_height, target_width = shape @@ -180,15 +183,18 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False # Labels labels_per_img = labels[i].copy() + if labels_per_img.size: boxes = np.copy(labels_per_img[:, 1:]) boxes[:, 0] = w * (labels_per_img[:, 1] - labels_per_img[:, 3] / 2) + padw # top left x boxes[:, 1] = h * (labels_per_img[:, 2] - labels_per_img[:, 4] / 2) + padh # top left y boxes[:, 2] = w * (labels_per_img[:, 1] + labels_per_img[:, 3] / 2) + padw # bottom right x boxes[:, 3] = h * (labels_per_img[:, 2] + labels_per_img[:, 4] / 2) + padh # bottom right y + labels_per_img[:, 1:] = boxes labels4.append(labels_per_img) + # Concat/clip labels labels4 = np.concatenate(labels4, 0) @@ -196,6 +202,7 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False # np.clip(x, 0, 2 * s, out=x) labels4[:, 1::2] = np.clip(labels4[:, 1::2], 0, 2 * target_width) labels4[:, 2::2] = np.clip(labels4[:, 2::2], 0, 2 * target_height) + # Augment img4, labels4 = random_affine(img4, labels4, @@ -205,4 +212,4 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False shear=hyp['shear'], new_shape=(target_height, target_width)) - return img4, labels4 + return img4, labels4 \ No newline at end of file diff --git a/yolov6/data/data_load.py b/yolov6/data/data_load.py index e68e8d71..923ab1f2 100644 --- a/yolov6/data/data_load.py +++ b/yolov6/data/data_load.py @@ -7,7 +7,7 @@ import torch.distributed as dist from torch.utils.data import dataloader, distributed -from .datasets import TrainValDataset +from .seg_datasets import TrainValDataset from yolov6.utils.events import LOGGER from yolov6.utils.torch_utils import torch_distributed_zero_first diff --git a/yolov6/data/seg_data_augment.py b/yolov6/data/seg_data_augment.py new file mode 100644 index 00000000..6a2c87b6 --- /dev/null +++ b/yolov6/data/seg_data_augment.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This code is based on +# https://github.com/ultralytics/yolov5/blob/master/utils/dataloaders.py + +import math +import random + +import cv2 +import numpy as np + + +def augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5): + '''HSV color-space augmentation.''' + if hgain or sgain or vgain: + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(im, cv2.COLOR_BGR2HSV)) + dtype = im.dtype # uint8 + + x = np.arange(0, 256, dtype=r.dtype) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))) + cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=im) # no return needed + + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32): + '''Resize and pad image while meeting stride-multiple constraints.''' + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + elif isinstance(new_shape, list) and len(new_shape) == 1: + new_shape = (new_shape[0], new_shape[0]) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + new_unpad = int(round(shape[1] * r)), 
int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + + return im, r, (left, top) + + +def mixup(im, labels, segments, im2, labels2, segments2): + # Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf + r = np.random.beta(32.0, 32.0) # mixup ratio, alpha=beta=32.0 + im = (im * r + im2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + segments = np.concatenate((segments, segments2), 0) + return im, labels, segments + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1, eps=1e-16): # box1(4,n), box2(4,n) + '''Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio.''' + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (ar < ar_thr) # candidates + + +def random_affine(img, labels=(), segments=(), degrees=10, translate=.1, scale=.1, shear=10, + new_shape=(640, 640), task=""): + '''Applies Random affine transformation.''' + n = len(labels) + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + height, width = new_shape + # print(height, width, (height, width)) + + M, s = get_transform_matrix(img.shape[:2], (height, width), degrees, scale, shear, translate) + if (M != np.eye(3)).any(): # image changed + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + new_segments = [] + # Transform label coordinates + if n: + new = np.zeros((n, 4)) + segments = resample_segments(segments) + for i, segment in enumerate(segments): + xy = np.ones((len(segment), 3)) + xy[:, :2] = segment + xy = xy @ M.T # transform + xy = (xy[:, :2]) + + # clip + new[i] = segment2box(xy, width, height) + new_segments.append(xy) + i = box_candidates(box1=labels[:, 1:5].T * s, box2=new.T, area_thr=0.01) + if task!="val": + labels = labels[i] + labels[:, 1:5] = new[i] + new_segments = np.array(new_segments)[i] + else: + labels[:, 1:5] = new + new_segments = np.array(new_segments) + return img, labels, new_segments + +def copy_paste(im, labels, segments, p=0.5): + # Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy) + n = len(segments) + if p and n: + h, w, c = im.shape # height, width, channels + im_new = np.zeros(im.shape, np.uint8) + for j in random.sample(range(n), k=round(p * n)): + l, s = labels[j], segments[j] + box = w - l[3], l[2], w - l[1], l[4] + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + if (ioa < 0.30).all(): # allow 30% obscuration of existing labels + labels = np.concatenate((labels, [[l[0], *box]]), 0) + segments.append(np.concatenate((w - s[:, 0:1], s[:, 1:2]), 1)) + cv2.drawContours(im_new, [segments[j].astype(np.int32)], -1, (1, 1, 1), cv2.FILLED) + result = cv2.flip(im, 1) # augment segments (flip left-right) + i = cv2.flip(im_new, 1).astype(bool) + im[i] = result[i] # cv2.imwrite('debug.jpg', im) # debug + + 
return im, labels, segments + +def bbox_ioa(box1, box2, eps=1e-7): + """ Returns the intersection over box2 area given box1, box2. Boxes are x1y1x2y2 + box1: np.array of shape(4) + box2: np.array of shape(nx4) + returns: np.array of shape(n) + """ + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1 + b2_x1, b2_y1, b2_x2, b2_y2 = box2.T + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps + + # Intersection over box2 area + return inter_area / box2_area + + +def regen_labels(labels=None, segments=None, new_shape=(640, 640)): + '''Applies Random affine transformation.''' + n = len(segments) + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + height, width = new_shape + + new_segments = [] + # Transform label coordinates + if n: + new = np.zeros((n, 4)) + segments = resample_segments(segments) + for i, segment in enumerate(segments): + new[i] = segment2box(segment, width, height) + new_segments.append(segment) + labels[:, 1:5] = new[i] + new_segments = np.array(new_segments)[i] + + return labels, new_segments + +def resample_segments(segments, n=1000): + # Up-sample an (n,2) segment + for i, s in enumerate(segments): + s = np.concatenate((s, s[0:1, :]), axis=0) + x = np.linspace(0, len(s) - 1, n) + xp = np.arange(len(s)) + segments[i] = np.concatenate([np.interp(x, xp, s[:, i]) for i in range(2)]).reshape(2, -1).T # segment xy + return segments + + +def get_transform_matrix(img_shape, new_shape, degrees, scale, shear, translate): + new_height, new_width = new_shape + # print(new_height, new_width) + # Center + C = np.eye(3) + C[0, 2] = -img_shape[1] / 2 # x translation (pixels) + C[1, 2] = -img_shape[0] / 2 # y translation (pixels) + + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * new_width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * new_height # y transla ion (pixels) + + # Combined rotation matrix + M = T @ S @ R @ C # order of operations (right to left) is IMPORTANT + return M, s + + +def mosaic_augmentation(shape, imgs, hs, ws, labels, segments, hyp, specific_shape = False, target_height=640, target_width=640): + '''Applies Mosaic augmentation.''' + assert len(imgs) == 4, "Mosaic augmentation of current version only supports 4 images." 
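+    # Editor's note (not part of the original patch): the mosaic canvas is
+    # (2*target_height, 2*target_width) filled with gray value 114, and a random
+    # centre (yc, xc) is drawn from [target//2, 3*target//2]; e.g. for a 640x640
+    # target the canvas is 1280x1280 and the centre lies in [320, 960] on each axis.
+    # Each tile's normalized (cx, cy, w, h) labels are mapped to pixel (x1, y1, x2, y2)
+    # on the canvas by adding the placement offsets padw/padh, and segment points get
+    # the same scale-and-shift before everything is clipped to the canvas. Note the
+    # early `return` further down: the random_affine step after it is dead code here,
+    # because the caller (get_mosaic) applies copy_paste and random_affine itself.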
+ labels4 = [] + segments4 = [] + if not specific_shape: + if isinstance(shape, list) or isinstance(shape, np.ndarray): + target_height, target_width = shape + else: + target_height = target_width = shape + + yc, xc = (int(random.uniform(x//2, 3*x//2)) for x in (target_height, target_width) ) # mosaic center x, y + + for i in range(len(imgs)): + # Load image + img, h, w = imgs[i], hs[i], ws[i] + # place img in img4 + if i == 0: # top left + img4 = np.full((target_height * 2, target_width * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, target_width * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(target_height * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, target_width * 2), min(target_height * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + labels_per_img = labels[i].copy() + segments_per_img = segments[i].copy() + if labels_per_img.size: + boxes = np.copy(labels_per_img[:, 1:]) + boxes[:, 0] = w * (labels_per_img[:, 1] - labels_per_img[:, 3] / 2) + padw # top left x + boxes[:, 1] = h * (labels_per_img[:, 2] - labels_per_img[:, 4] / 2) + padh # top left y + boxes[:, 2] = w * (labels_per_img[:, 1] + labels_per_img[:, 3] / 2) + padw # bottom right x + boxes[:, 3] = h * (labels_per_img[:, 2] + labels_per_img[:, 4] / 2) + padh # bottom right y + for __ in range(len(segments_per_img)): + segments_per_img[__][:, 0] = w * segments_per_img[__][:, 0] + padw + segments_per_img[__][:, 1] = h * segments_per_img[__][:, 1] + padh + labels_per_img[:, 1:] = boxes + + labels4.append(labels_per_img) + segments4.extend(segments_per_img) + + # Concat/clip labels + labels4 = np.concatenate(labels4, 0) + # for x in (labels4[:, 1:]): + # np.clip(x, 0, 2 * s, out=x) + labels4[:, 1::2] = np.clip(labels4[:, 1::2], 0, 2 * target_width) + labels4[:, 2::2] = np.clip(labels4[:, 2::2], 0, 2 * target_height) + for __ in range(len(segments4)): + segments4[__][:, 0] = np.clip(segments4[__][:, 0], 0, 2 * target_width) + segments4[__][:, 1] = np.clip(segments4[__][:, 1], 0, 2 * target_height) + + # Augment + return img4, labels4, segments4 + img4, labels4, segments4 = random_affine(img4, labels4, segments4, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + shear=hyp['shear'], + new_shape=(target_height, target_width)) + + return img4, labels4, segments4 + +def segment2box(segment, width=640, height=640): + # Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) 
to (xyxy) + x, y = segment.T # segment xy + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y, = x[inside], y[inside] + return np.array([x.min(), y.min(), x.max(), y.max()]) if any(x) else np.zeros((1, 4)) # xyxy + diff --git a/yolov6/data/seg_datasets.py b/yolov6/data/seg_datasets.py new file mode 100644 index 00000000..8cca6513 --- /dev/null +++ b/yolov6/data/seg_datasets.py @@ -0,0 +1,859 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import glob +from io import UnsupportedOperation +import os +import os.path as osp +import random +import json +import time +import hashlib +from pathlib import Path +import copy + +from multiprocessing.pool import Pool + +import cv2 +import numpy as np +from tqdm import tqdm +from PIL import ExifTags, Image, ImageOps + +import torch +from torch.utils.data import Dataset +import torch.distributed as dist + +from .seg_data_augment import ( + augment_hsv, + letterbox, + mixup, + random_affine, + mosaic_augmentation, + copy_paste +) +from yolov6.utils.events import LOGGER +import pickle + + +# Parameters +IMG_FORMATS = ["bmp", "jpg", "jpeg", "png", "tif", "tiff", "dng", "webp", "mpo"] +VID_FORMATS = ["mp4", "mov", "avi", "mkv"] +IMG_FORMATS.extend([f.upper() for f in IMG_FORMATS]) +VID_FORMATS.extend([f.upper() for f in VID_FORMATS]) +# Get orientation exif tag +for k, v in ExifTags.TAGS.items(): + if v == "Orientation": + ORIENTATION = k + break + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = f'{os.sep}images{os.sep}', f'{os.sep}labels{os.sep}' # /images/, /labels/ substrings + return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in img_paths] + +class TrainValDataset(Dataset): + '''YOLOv6 train_loader/val_loader, loads images and labels for training and validation.''' + def __init__( + self, + img_dir, + img_size=640, + batch_size=16, + augment=False, + hyp=None, + rect=False, + check_images=False, + check_labels=False, + stride=32, + pad=0.0, + rank=-1, + data_dict=None, + task="train", + specific_shape = False, + height=1088, + width=1920, + downsample_ratio=4, + overlap=False + ): + assert task.lower() in ("train", "val", "test", "speed"), f"Not supported task: {task}" + + t1 = time.time() + self.__dict__.update(locals()) + if task.lower()!="train": + self.downsample_ratio = 1 + self.main_process = self.rank in (-1, 0) + self.task = self.task.capitalize() + self.class_names = data_dict["names"] + self.img_paths, self.labels = self.get_imgs_labels(self.img_dir) + self.labels, self.segments = self.get_segment(self.labels) + + self.rect = rect + self.specific_shape = specific_shape + self.target_height = height + self.target_width = width + if self.rect: + shapes = [self.img_info[p]["shape"] for p in self.img_paths] + self.shapes = np.array(shapes, dtype=np.float64) + if dist.is_initialized(): + # in DDP mode, we need to make sure all images within batch_size * gpu_num + # will resized and padded to same shape. + sample_batch_size = self.batch_size * dist.get_world_size() + else: + sample_batch_size = self.batch_size + self.batch_indices = np.floor( + np.arange(len(shapes)) / sample_batch_size + ).astype( + np.int_ + ) # batch indices of each image + + self.sort_files_shapes() + + t2 = time.time() + if self.main_process: + LOGGER.info(f"%.1fs for dataset initialization." % (t2 - t1)) + + def __len__(self): + """Get the length of dataset""" + return len(self.img_paths) + + def __getitem__(self, index): + """Fetching a data sample for a given key. 
+ This function applies mosaic and mixup augments during training. + During validation, letterbox augment is applied. + """ + target_shape = ( + (self.target_height, self.target_width) if self.specific_shape else + self.batch_shapes[self.batch_indices[index]] if self.rect + else self.img_size + ) + + # Mosaic Augmentation + if self.augment and random.random() < self.hyp["mosaic"]: + img, labels, segments = self.get_mosaic(index, target_shape) + shapes = None + + + # MixUp augmentation + if random.random() < self.hyp["mixup"]: + img_other, labels_other, segments_other = self.get_mosaic( + random.randint(0, len(self.img_paths) - 1), target_shape + ) + img, labels, segments = mixup(img, labels, segments, img_other, labels_other, segments_other) # To Change + + else: + # Load image + if self.hyp and "shrink_size" in self.hyp: + img, (h0, w0), (h, w) = self.load_image(index, self.hyp["shrink_size"]) + else: + img, (h0, w0), (h, w) = self.load_image(index) + + # letterbox + img, ratio, pad = letterbox(img, target_shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h * ratio / h0, w * ratio / w0), pad) # for COCO mAP rescaling + labels = copy.deepcopy(self.labels[index]) + segments = copy.deepcopy(self.segments[index]) + + if labels.size: + w *= ratio + h *= ratio + # new boxes + boxes = np.copy(labels[:, 1:5]) + boxes[:, 0] = ( + w * (labels[:, 1] - labels[:, 3] / 2) + pad[0] + ) # top left x + boxes[:, 1] = ( + h * (labels[:, 2] - labels[:, 4] / 2) + pad[1] + ) # top left y + boxes[:, 2] = ( + w * (labels[:, 1] + labels[:, 3] / 2) + pad[0] + ) # bottom right x + boxes[:, 3] = ( + h * (labels[:, 2] + labels[:, 4] / 2) + pad[1] + ) # bottom right y + labels[:, 1:] = boxes + + if len(segments): + for i_s in range(len(segments)): + segments[i_s][:, 0] = segments[i_s][:, 0] * ratio * w + pad[0] + segments[i_s][:, 1] = segments[i_s][:, 1] * ratio * h + pad[1] + + if self.augment: + img, labels, segments = random_affine( + img, + labels, + segments, + degrees=self.hyp["degrees"], + translate=self.hyp["translate"], + scale=self.hyp["scale"], + shear=self.hyp["shear"], + new_shape=target_shape, + ) + else: + img, labels, segments = random_affine( + img, + labels, + segments, + degrees=0, + translate=0, + scale=0, + shear=0, + new_shape=target_shape, + task="val" + ) + + + if len(labels): + h, w = img.shape[:2] + + labels[:, [1, 3]] = labels[:, [1, 3]].clip(0, w - 1e-3) # x1, x2 + labels[:, [2, 4]] = labels[:, [2, 4]].clip(0, h - 1e-3) # y1, y2 + + boxes = np.copy(labels[:, 1:]) + boxes[:, 0] = ((labels[:, 1] + labels[:, 3]) / 2) / w # x center + boxes[:, 1] = ((labels[:, 2] + labels[:, 4]) / 2) / h # y center + boxes[:, 2] = (labels[:, 3] - labels[:, 1]) / w # width + boxes[:, 3] = (labels[:, 4] - labels[:, 2]) / h # height + labels[:, 1:] = boxes + lindex = labels[:, 0] >= 0 + masks = self.polygons2masks(img.shape[:2], segments, color=1, downsample_ratio=self.downsample_ratio) + labels = labels[lindex] + masks = masks[lindex] + + else: + masks = np.asarray([]) + + if self.augment: + img, labels, masks = self.general_augment(img, labels, masks.transpose(1, 2, 0) if masks.shape[0]!=0 else masks) + + #? 
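+        # Editor's note (not part of the original patch): at this point `masks` is an
+        # (n, H // downsample_ratio, W // downsample_ratio) array with one binary mask
+        # per kept label. When there are no instances, the placeholder below keeps the
+        # shapes collate-friendly: a single zero map if self.overlap, else one zero
+        # mask per label, so collate_fn can torch.cat the masks across the batch.
+        # labels_out is (n, 6); column 0 is filled with the image index in collate_fn.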
+ + masks_out = (torch.from_numpy(masks.copy()) if len(masks) else torch.zeros(1 if self.overlap else len(labels), img.shape[0] // + self.downsample_ratio, img.shape[1] // + self.downsample_ratio)) + + labels_out = torch.zeros((len(labels), 6)) + if len(labels): + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + # self.drawit(img, labels, masks, self.img_paths[index], self.task) + img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB + img = np.ascontiguousarray(img) + return torch.from_numpy(img), labels_out, self.img_paths[index], shapes, masks_out + + def load_image(self, index, shrink_size=None): + """Load image. + This function loads image by cv2, resize original image to target shape(img_size) with keeping ratio. + + Returns: + Image, original shape of image, resized image shape + """ + path = self.img_paths[index] + try: + im = cv2.imread(path) + assert im is not None, f"opencv cannot read image correctly or {path} not exists" + except: + im = cv2.cvtColor(np.asarray(Image.open(path)), cv2.COLOR_RGB2BGR) + assert im is not None, f"Image Not Found {path}, workdir: {os.getcwd()}" + + h0, w0 = im.shape[:2] # origin shape + if self.specific_shape: + # keep ratio resize + ratio = min(self.target_width / w0, self.target_height / h0) + + elif shrink_size: + ratio = (self.img_size - shrink_size) / max(h0, w0) + + else: + ratio = self.img_size / max(h0, w0) + + if ratio != 1: + im = cv2.resize( + im, + (int(w0 * ratio), int(h0 * ratio)), + interpolation=cv2.INTER_AREA + if ratio < 1 and not self.augment + else cv2.INTER_LINEAR, + ) + return im, (h0, w0), im.shape[:2] + + @staticmethod + def collate_fn(batch): + """Merges a list of samples to form a mini-batch of Tensor(s)""" + img, label, path, shapes, masks = zip(*batch) + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes, torch.cat(masks, 0) + + @staticmethod + def get_segment(labels): + rlabels = [] + segments = [] + if len(labels) == 0: + return np.asarray([]) + for label in labels: + z1 = []#labels + z2 = []#seg + for l in label: + z1.append(np.asarray(l[:5]).reshape(1, 5).astype(np.float32)) + z2.append(np.asarray(l[1:]).reshape(-1, 2).astype(np.float32)) + if z1: + rlabels.append(np.concatenate(z1, axis = 0)) + segments.append(z2) + else: + t = np.zeros((1, 5), dtype = np.float32) + t[..., 0]= -1 + rlabels.append(t) + segments.append([np.zeros((2, 2), dtype = np.float32)]) + return rlabels, segments + + + + + def get_imgs_labels(self, img_dirs): + if not isinstance(img_dirs, list): + img_dirs = [img_dirs] + # we store the cache img file in the first directory of img_dirs + valid_img_record = osp.join( + osp.dirname(img_dirs[0]), "." + osp.basename(img_dirs[0]) + "_cache.json" + ) + NUM_THREADS = min(8, os.cpu_count()) + img_paths = [] + for img_dir in img_dirs: + assert osp.exists(img_dir), f"{img_dir} is an invalid directory path!" + img_paths += glob.glob(osp.join(img_dir, "**/*"), recursive=True) + + img_paths = sorted( + p for p in img_paths if p.split(".")[-1].lower() in IMG_FORMATS and os.path.isfile(p) + ) + + assert img_paths, f"No images found in {img_dir}." 
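+        # Editor's note (not part of the original patch): a hidden cache file
+        # ".<image_dir_name>_cache.json" next to the first image directory stores the
+        # per-image shapes/labels together with md5 hashes of the path lists; when the
+        # stored "image_hash"/"label_hash" still match, the per-file checks below are
+        # skipped. Illustrative layout (paths are hypothetical):
+        # {"information": {"/data/images/train/000001.jpg": {"shape": [480, 640],
+        #                                                    "labels": [...]}},
+        #  "image_hash": "<md5>", "label_hash": "<md5>"}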
+ img_hash = self.get_hash(img_paths) + LOGGER.info(f'img record infomation path is:{valid_img_record}') + if osp.exists(valid_img_record): + with open(valid_img_record, "r") as f: + cache_info = json.load(f) + if "image_hash" in cache_info and cache_info["image_hash"] == img_hash: + img_info = cache_info["information"] + else: + self.check_images = True + else: + self.check_images = True + + # check images + if self.check_images and self.main_process: + img_info = {} + nc, msgs = 0, [] # number corrupt, messages + LOGGER.info( + f"{self.task}: Checking formats of images with {NUM_THREADS} process(es): " + ) + with Pool(NUM_THREADS) as pool: + pbar = tqdm( + pool.imap(TrainValDataset.check_image, img_paths), + total=len(img_paths), + ) + for img_path, shape_per_img, nc_per_img, msg in pbar: + if nc_per_img == 0: # not corrupted + img_info[img_path] = {"shape": shape_per_img} + nc += nc_per_img + if msg: + msgs.append(msg) + pbar.desc = f"{nc} image(s) corrupted" + pbar.close() + if msgs: + LOGGER.info("\n".join(msgs)) + + cache_info = {"information": img_info, "image_hash": img_hash} + # save valid image paths. + with open(valid_img_record, "w") as f: + json.dump(cache_info, f) + + # check and load anns + + img_paths = list(img_info.keys()) + label_paths = img2label_paths(img_paths) + assert label_paths, f"No labels found." + label_hash = self.get_hash(label_paths) + if "label_hash" not in cache_info or cache_info["label_hash"] != label_hash: + self.check_labels = True + + if self.check_labels: + cache_info["label_hash"] = label_hash + nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number corrupt, messages + LOGGER.info( + f"{self.task}: Checking formats of labels with {NUM_THREADS} process(es): " + ) + with Pool(NUM_THREADS) as pool: + pbar = pool.imap( + TrainValDataset.check_label_files, zip(img_paths, label_paths) + ) + pbar = tqdm(pbar, total=len(label_paths)) if self.main_process else pbar + for ( + img_path, + labels_per_file, + nc_per_file, + nm_per_file, + nf_per_file, + ne_per_file, + msg, + ) in pbar: + if nc_per_file == 0: + img_info[img_path]["labels"] = labels_per_file + else: + img_info.pop(img_path) + nc += nc_per_file + nm += nm_per_file + nf += nf_per_file + ne += ne_per_file + if msg: + msgs.append(msg) + if self.main_process: + pbar.desc = f"{nf} label(s) found, {nm} label(s) missing, {ne} label(s) empty, {nc} invalid label files" + if self.main_process: + pbar.close() + with open(valid_img_record, "w") as f: + json.dump(cache_info, f) + if msgs: + LOGGER.info("\n".join(msgs)) + if nf == 0: + LOGGER.warning( + f"WARNING: No labels found in {osp.dirname(img_paths[0])}. " + ) + + if self.task.lower() == "val": + if self.data_dict.get("is_coco", False): # use original json file when evaluating on coco dataset. + assert osp.exists(self.data_dict["anno_path"]), "Eval on coco dataset must provide valid path of the annotation file in config file: data/coco.yaml" + else: + assert ( + self.class_names + ), "Class names is required when converting labels to coco format for evaluating." 
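+                # Editor's note (not part of the original patch): for non-COCO data the
+                # loader writes its own COCO-style file "annotations/instances_<image_dir>.json"
+                # via generate_coco_format_labels below, converting each normalized
+                # (cls, cx, cy, w, h, poly...) row into a pixel [x1, y1, w, h] bbox plus a
+                # flattened pixel polygon under "segmentation", so pycocotools can evaluate it.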
+ save_dir = osp.join(osp.dirname(osp.dirname(img_dirs[0])), "annotations") + if not osp.exists(save_dir): + os.mkdir(save_dir) + save_path = osp.join( + save_dir, "instances_" + osp.basename(img_dirs[0]) + ".json" + ) + TrainValDataset.generate_coco_format_labels( + img_info, self.class_names, save_path + ) + + # img_paths, labels = list( + # zip( + # *[ + # ( + # img_path, + # np.array(info["labels"], dtype=np.float32) + # if info["labels"] + # else np.zeros((0, 5), dtype=np.float32), + # ) + # for img_path, info in img_info.items() + # ] + # ) + # ) + img_paths, labels = list( + zip( + *[ + ( + img_path, + info["labels"] + if info["labels"] + else [], + ) + for img_path, info in img_info.items() + ] + ) + ) + self.img_info = img_info + LOGGER.info( + f"{self.task}: Final numbers of valid images: {len(img_paths)}/ labels: {len(labels)}. " + ) + return img_paths, labels + + def get_mosaic(self, index, shape): + """Gets images and labels after mosaic augments""" + indices = [index] + random.choices( + range(0, len(self.img_paths)), k=3 + ) # 3 additional image indices + random.shuffle(indices) + imgs, hs, ws, labels, segments = [], [], [], [], [] + for index in indices: + img, _, (h, w) = self.load_image(index) + labels_per_img = self.labels[index] + segments_per_img = copy.deepcopy(self.segments[index]) + imgs.append(img) + hs.append(h) + ws.append(w) + labels.append(labels_per_img) + segments.append(segments_per_img) + img, labels, segments = mosaic_augmentation(shape, imgs, hs, ws, labels, segments, self.hyp, self.specific_shape, self.target_height, self.target_width) + img, labels, segments = copy_paste(img, labels, segments, 0) + img, labels, segments = random_affine(img, labels, segments, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + new_shape=shape if not self.specific_shape else (self.target_height, self.target_width)) + return img, labels, segments + + def general_augment(self, img, labels, segments): + """Gets images and labels after general augment + This function applies hsv, random ud-flip and random lr-flips augments. 
+ """ + nl = len(labels) + + # HSV color-space + augment_hsv( + img, + hgain=self.hyp["hsv_h"], + sgain=self.hyp["hsv_s"], + vgain=self.hyp["hsv_v"], + ) + + # Flip up-down + if random.random() < self.hyp["flipud"]: + img = np.flipud(img) + if nl: + segments = np.flipud(segments) + labels[:, 2] = 1 - labels[:, 2] + + # Flip left-right + if random.random() < self.hyp["fliplr"]: + img = np.fliplr(img) + if nl: + segments = np.fliplr(segments) + labels[:, 1] = 1 - labels[:, 1] + + return img, labels, segments.transpose(2, 0, 1) if segments.shape[0]!=0 else segments + + def sort_files_shapes(self): + '''Sort by aspect ratio.''' + batch_num = self.batch_indices[-1] + 1 + s = self.shapes # [height, width] + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_paths = [self.img_paths[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.segments = [self.segments[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * batch_num + for i in range(batch_num): + ari = ar[self.batch_indices == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [1, maxi] + elif mini > 1: + shapes[i] = [1 / mini, 1] + self.batch_shapes = ( + np.ceil(np.array(shapes) * self.img_size / self.stride + self.pad).astype( + np.int_ + ) + * self.stride + ) + + @staticmethod + def check_image(im_file): + '''Verify an image.''' + nc, msg = 0, "" + try: + im = Image.open(im_file) + im.verify() # PIL verify + im = Image.open(im_file) # need to reload the image after using verify() + shape = (im.height, im.width) # (height, width) + try: + im_exif = im._getexif() + if im_exif and ORIENTATION in im_exif: + rotation = im_exif[ORIENTATION] + if rotation in (6, 8): + shape = (shape[1], shape[0]) + except: + im_exif = None + + assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels" + assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}" + if im.format.lower() in ("jpg", "jpeg"): + with open(im_file, "rb") as f: + f.seek(-2, 2) + if f.read() != b"\xff\xd9": # corrupt JPEG + ImageOps.exif_transpose(Image.open(im_file)).save( + im_file, "JPEG", subsampling=0, quality=100 + ) + msg += f"WARNING: {im_file}: corrupt JPEG restored and saved" + return im_file, shape, nc, msg + except Exception as e: + nc = 1 + msg = f"WARNING: {im_file}: ignoring corrupt image: {e}" + return im_file, None, nc, msg + + @staticmethod + def xyn2xy(x, w=640, h=640, padw=0, padh=0): + # Convert normalized segments into pixel segments, shape (n,2) + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[..., 0] = w * x[..., 0] + padw # top left x + y[..., 1] = h * x[..., 1] + padh # top left y + return y + + @staticmethod + def drawit(img, labels, masks, imgname = "", task = ""): + # Convert normalized segments into pixel segments, shape (n,2) + # There are some bugs in Val! 
+ if task == "Val": + return 0 + import copy + + spsp = copy.deepcopy(img) + for label in labels: + xy = label[1:3] * np.asarray(img.shape[:2])[::-1] + wh = label[3:5] * np.asarray(img.shape[:2])[::-1] + pt1 = (xy - wh / 2).astype(np.int_) + pt2 = (xy + wh / 2).astype(np.int_) + cv2.rectangle(spsp, pt1, pt2, (0,255,255), 1) + ssss = random.randint(0,100000000) + for mask in masks: + if mask.shape[:2]!=(img.shape[0], img.shape[1]): + m = cv2.resize(mask,(img.shape[0], img.shape[1])) + else: + m = mask + m = m.reshape(img.shape[0], img.shape[1], 1) + q = np.ones((img.shape[0], img.shape[1], 1), dtype = np.int_) * 255 * m + q = q * m + s = np.zeros((img.shape[0], img.shape[1], 2)) + s = np.concatenate([s, q], axis = 2) + spsp = cv2.addWeighted(spsp, 1, s.astype(np.int_), 0.5, 0, dtype=cv2.CV_8U) + print(img.shape, labels.shape, masks.shape) + try: + print(cv2.imwrite("/home/hadoop-seccv/ssd/wangzhaonian/yolov6_seg/test_img/{}.jpg".format(ssss), spsp)) + print(imgname, ssss, len(labels), len(masks)) + except: + print("?") + + + @staticmethod + def check_label_files(args): + img_path, lb_path = args + nm, nf, ne, nc, msg = 0, 0, 0, 0, "" # number (missing, found, empty, message + try: + if osp.exists(lb_path): + nf = 1 # label found + with open(lb_path, "r") as f: + labels = [ + x.split() for x in f.read().strip().splitlines() if len(x) > 5 # get which has seg + ] + # labels = np.array(labels, dtype=np.float32) + if len(labels): + # assert all( + # len(l) >= 5 for l in labels + # ), f"{lb_path}: wrong label format." + # assert ( + # labels >= 0 + # ).all(), f"{lb_path}: Label values error: all values in label file must > 0" + # assert ( + # labels[:, 1:] <= 1 + # ).all(), f"{lb_path}: Label values error: all coordinates must be normalized" + + # _, indices = np.unique(labels, axis=0, return_index=True) + # if len(indices) < len(labels): # duplicate row check + # labels = labels[indices] # remove duplicates + # msg += f"WARNING: {lb_path}: {len(labels) - len(indices)} duplicate labels removed" + # labels = labels.tolist() + _t = 0 + else: + ne = 1 # label empty + labels = [] + else: + nm = 1 # label missing + labels = [] + return img_path, labels, nc, nm, nf, ne, msg + except Exception as e: + nc = 1 + msg = f"WARNING: {lb_path}: ignoring invalid labels: {e}" + return img_path, None, nc, nm, nf, ne, msg + + @staticmethod + def polygon2mask(img_size, polygons, color=1, downsample_ratio=1): + mask = np.zeros(img_size, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + nh, nw = (img_size[0] // downsample_ratio, img_size[1] // downsample_ratio) + # NOTE: fillPoly firstly then resize is trying the keep the same way + # of loss calculation when mask-ratio=1. + mask = cv2.resize(mask, (nw, nh)) + return mask + + def polygons2masks(self, img_size, polygons, color, downsample_ratio=1): + """ + Args: + img_size (tuple): The image size. + polygons (list[np.ndarray]): each polygon is [N, M], + N is the number of polygons, + M is the number of points(Be divided by 2). 
+ """ + masks = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_size, [polygons[si].reshape(-1)], color, downsample_ratio) + masks.append(mask) + return np.array(masks) + + + def polygons2masks_overlap(self, img_size, segments, downsample_ratio=1): + """Return a (640, 640) overlap mask.""" + masks = np.zeros((img_size[0] // downsample_ratio, img_size[1] // downsample_ratio), + dtype=np.int32 if len(segments) > 255 else np.uint8) + areas = [] + ms = [] + for si in range(len(segments)): + mask = self.polygon2mask( + img_size, + [segments[si].reshape(-1)], + downsample_ratio=downsample_ratio, + color=1, + ) + ms.append(mask) + areas.append(mask.sum()) + areas = np.asarray(areas) + index = np.argsort(-areas) + ms = np.array(ms)[index] + for i in range(len(segments)): + mask = ms[i] * (i + 1) + masks = masks + mask + masks = np.clip(masks, a_min=0, a_max=i + 1) + return masks, index + + @staticmethod + def generate_coco_format_labels(img_info, class_names, save_path): + # for evaluation with pycocotools + dataset = {"categories": [], "annotations": [], "images": []} + for i, class_name in enumerate(class_names): + dataset["categories"].append( + {"id": i, "name": class_name, "supercategory": ""} + ) + + ann_id = 0 + LOGGER.info(f"Convert to COCO format") + for i, (img_path, info) in enumerate(tqdm(img_info.items())): + labels = info["labels"] if info["labels"] else [] + img_id = osp.splitext(osp.basename(img_path))[0] + img_h, img_w = info["shape"] + dataset["images"].append( + { + "file_name": os.path.basename(img_path), + "id": img_id, + "width": img_w, + "height": img_h, + } + ) + if labels: + for label in labels: + c, x, y, w, h = label[:5] + c, x, y, w, h = float(c), float(x), float(y), float(w), float(h) + seg = np.asarray(label[5:]).astype(np.float32) + seg = seg.reshape(-1, 2) + #breakpoint() + seg = seg * np.asarray([img_w, img_h]) + seg = seg.reshape(-1) + # convert x,y,w,h to x1,y1,x2,y2 + x1 = (x - w / 2) * img_w + y1 = (y - h / 2) * img_h + x2 = (x + w / 2) * img_w + y2 = (y + h / 2) * img_h + # cls_id starts from 0 + cls_id = int(c) + w = max(0, x2 - x1) + h = max(0, y2 - y1) + dataset["annotations"].append( + { + "area": h * w, + "bbox": [x1, y1, w, h], + "category_id": cls_id, + "id": ann_id, + "image_id": img_id, + "iscrowd": 0, + # mask + "segmentation": list(seg), + } + ) + ann_id += 1 + + with open(save_path, "w") as f: + json.dump(dataset, f) + LOGGER.info( + f"Convert to COCO format finished. Resutls saved in {save_path}" + ) + + @staticmethod + def get_hash(paths): + """Get the hash value of paths""" + assert isinstance(paths, list), "Only support list currently." 
+ h = hashlib.md5("".join(paths).encode()) + return h.hexdigest() + + +class LoadData: + def __init__(self, path, webcam, webcam_addr): + self.webcam = webcam + self.webcam_addr = webcam_addr + if webcam: # if use web camera + imgp = [] + vidp = [int(webcam_addr) if webcam_addr.isdigit() else webcam_addr] + else: + p = str(Path(path).resolve()) # os-agnostic absolute path + if os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '**/*.*'), recursive=True)) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise FileNotFoundError(f'Invalid path {p}') + imgp = [i for i in files if i.split('.')[-1] in IMG_FORMATS] + vidp = [v for v in files if v.split('.')[-1] in VID_FORMATS] + self.files = imgp + vidp + self.nf = len(self.files) + self.type = 'image' + if len(vidp) > 0: + self.add_video(vidp[0]) # new video + else: + self.cap = None + + # @staticmethod + def checkext(self, path): + if self.webcam: + file_type = 'video' + else: + file_type = 'image' if path.split('.')[-1].lower() in IMG_FORMATS else 'video' + return file_type + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + if self.checkext(path) == 'video': + self.type = 'video' + ret_val, img = self.cap.read() + while not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + path = self.files[self.count] + self.add_video(path) + ret_val, img = self.cap.read() + else: + # Read image + self.count += 1 + img = cv2.imread(path) # BGR + return img, path, self.cap + + def add_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files diff --git a/yolov6/models/efficientrep.py b/yolov6/models/efficientrep.py index 5d0de7ce..4ca75083 100644 --- a/yolov6/models/efficientrep.py +++ b/yolov6/models/efficientrep.py @@ -387,20 +387,11 @@ def __init__( block=RepVGGBlock, csp_e=float(1)/2, fuse_P2=False, - cspsppf=False, - stage_block_type="BepC3" + cspsppf=False ): super().__init__() assert channels_list is not None assert num_repeats is not None - - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - self.fuse_P2 = fuse_P2 self.stem = block( @@ -417,7 +408,7 @@ def __init__( kernel_size=3, stride=2 ), - stage_block( + BepC3( in_channels=channels_list[1], out_channels=channels_list[1], n=num_repeats[1], @@ -433,7 +424,7 @@ def __init__( kernel_size=3, stride=2 ), - stage_block( + BepC3( in_channels=channels_list[2], out_channels=channels_list[2], n=num_repeats[2], @@ -449,7 +440,7 @@ def __init__( kernel_size=3, stride=2 ), - stage_block( + BepC3( in_channels=channels_list[3], out_channels=channels_list[3], n=num_repeats[3], @@ -469,7 +460,7 @@ def __init__( kernel_size=3, stride=2, ), - stage_block( + BepC3( in_channels=channels_list[4], out_channels=channels_list[4], n=num_repeats[4], @@ -484,7 +475,7 @@ def __init__( kernel_size=3, stride=2, ), - stage_block( + BepC3( in_channels=channels_list[5], out_channels=channels_list[5], n=num_repeats[5], diff --git a/yolov6/models/effidehead_seg.py b/yolov6/models/effidehead_seg.py new file mode 100644 index 00000000..2bfe9843 --- /dev/null +++ b/yolov6/models/effidehead_seg.py @@ -0,0 +1,452 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from 
yolov6.layers.common import * +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox + + +class Detect(nn.Module): + export = False + '''Efficient Decoupled Head + With hardware-aware degisn, the decoupled head is optimized with + hybridchannels methods. + ''' + def __init__(self, num_classes=80, num_layers=3, inplace=True, head_layers=None, reg_mask=None, use_dfl=True, reg_max=16, nm=32): # detection layer + # nm: number of masks + super().__init__() + assert head_layers is not None + assert reg_mask is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 + nm # number of outputs per anchor + self.nl = num_layers # number of detection layers + self.nm = nm + self.grid = [torch.zeros(1)] * num_layers + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] if num_layers == 3 else [8, 16, 32, 64] # strides computed during build + self.stride = torch.tensor(stride) + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2d(self.reg_max + 1, 1, 1, bias=False) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + + # Init decouple head + self.stems = nn.ModuleList() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*5 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.cls_preds.append(head_layers[idx+3]) + self.reg_preds.append(head_layers[idx+4]) + + def initialize_biases(self): + + for conv in self.cls_preds: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + for conv in self.reg_preds: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) 
+ conv.weight = torch.nn.Parameter(w, requires_grad=True) + + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.proj_conv.weight = nn.Parameter(self.proj.view([1, self.reg_max + 1, 1, 1]).clone().detach(), + requires_grad=False) + + def forward(self, x): + if self.training: + cls_score_list = [] + reg_distri_list = [] + + for i in range(self.nl): + x[i] = self.stems[i](x[i]) + cls_x = x[i] + reg_x = x[i] + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + cls_output = torch.sigmoid(cls_output) + cls_score_list.append(cls_output.flatten(2).permute((0, 2, 1))) + reg_distri_list.append(reg_output.flatten(2).permute((0, 2, 1))) + + cls_score_list = torch.cat(cls_score_list, axis=1) + reg_distri_list = torch.cat(reg_distri_list, axis=1) + + return x, cls_score_list, reg_distri_list + else: + cls_score_list = [] + reg_dist_list = [] + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + x[i] = self.stems[i](x[i]) + cls_x = x[i] + reg_x = x[i] + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + if self.use_dfl: + reg_output = reg_output.reshape([-1, 4, self.reg_max + 1, l]).permute(0, 2, 1, 3) + reg_output = self.proj_conv(F.softmax(reg_output, dim=1)) + + cls_output = torch.sigmoid(cls_output) + + if self.export: + cls_score_list.append(cls_output) + reg_dist_list.append(reg_output) + else: + cls_score_list.append(cls_output.reshape([b, self.nc, l])) + reg_dist_list.append(reg_output.reshape([b, 4, l])) + + if self.export: + return tuple(torch.cat([cls, reg], 1) for cls, reg in zip(cls_score_list, reg_dist_list)) + + cls_score_list = torch.cat(cls_score_list, axis=-1).permute(0, 2, 1) + reg_dist_list = torch.cat(reg_dist_list, axis=-1).permute(0, 2, 1) + + + anchor_points, stride_tensor = generate_anchors( + x, self.stride, self.grid_cell_size, self.grid_cell_offset, device=x[0].device, is_eval=True, mode='af') + + pred_bboxes = dist2bbox(reg_dist_list, anchor_points, box_format='xywh') + pred_bboxes *= stride_tensor + return torch.cat( + [ + pred_bboxes, + torch.ones((b, pred_bboxes.shape[1], 1), device=pred_bboxes.device, dtype=pred_bboxes.dtype), + cls_score_list + ], + axis=-1) + +def build_seg_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + 
in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # cls_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + if num_layers == 4: + head_layers.add_module('stem3', + # stem3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=1, + stride=1 + ) + ) + head_layers.add_module('cls_conv3', + # cls_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('reg_conv3', + # reg_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('cls_pred3', + # cls_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ) + ) + head_layers.add_module('reg_pred3', + # reg_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + return head_layers + + +###### + + +def build_effidehead_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + 
out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # cls_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + if num_layers == 4: + head_layers.add_module('stem3', + # stem3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=1, + stride=1 + ) + ) + head_layers.add_module('cls_conv3', + # cls_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('reg_conv3', + # reg_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('cls_pred3', + # cls_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ) + ) + head_layers.add_module('reg_pred3', + # reg_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + return head_layers diff --git a/yolov6/models/heads/effidehead_fuseab_seg.py b/yolov6/models/heads/effidehead_fuseab_seg.py new file mode 100644 index 00000000..80272928 --- /dev/null +++ b/yolov6/models/heads/effidehead_fuseab_seg.py @@ -0,0 +1,551 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from yolov6.layers.common import * +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox + + +class Detect(nn.Module): + export = False + '''Efficient Decoupled Head for fusing anchor-base branches. 
+ ''' + def __init__(self, num_classes=80, anchors=None, num_layers=3, inplace=True, head_layers=None, reg_mask=None, use_dfl=True, reg_max=16, nm=32, fuse_ab=False): # detection layer + super().__init__() + assert head_layers is not None + assert reg_mask is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 + nm # number of outputs per anchor + self.nl = num_layers # number of detection layers + self.nm = nm # number of masks + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.grid = [torch.zeros(1)] * num_layers + self.fuse_ab = fuse_ab + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] if num_layers == 3 else [8, 16, 32, 64] # strides computed during build + self.stride = torch.tensor(stride) + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2d(self.reg_max + 1, 1, 1, bias=False) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + self.anchors_init= ((torch.tensor(anchors) / self.stride[:,None])).reshape(self.nl, self.na, 2) + self.reg_mask = reg_mask + + # Init decouple head + self.stems = nn.ModuleList() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.seg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.seg_preds = nn.ModuleList() + self.cls_preds_ab = nn.ModuleList() + self.reg_preds_ab = nn.ModuleList() + self.seg_preds_ab = nn.ModuleList() + self.seg_proto = nn.ModuleList() + self.seg_proto.append(reg_mask[0]) + + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*10 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.seg_convs.append(head_layers[idx+3]) + self.cls_preds.append(head_layers[idx+4]) + self.reg_preds.append(head_layers[idx+5]) + self.seg_preds.append(head_layers[idx+6]) + if self.fuse_ab: + self.cls_preds_ab.append(head_layers[idx+7]) + self.reg_preds_ab.append(head_layers[idx+8]) + self.seg_preds_ab.append(head_layers[idx+9]) + + + def initialize_biases(self): + + for conv in self.cls_preds: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.cls_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + for conv in self.reg_preds: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.reg_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) 
+ conv.weight = torch.nn.Parameter(w, requires_grad=True) + + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.proj_conv.weight = nn.Parameter(self.proj.view([1, self.reg_max + 1, 1, 1]).clone().detach(), + requires_grad=False) + + def handleseg_af(self, sgot_lst, sg_msk_lst): + ''' + sg_msk_lst --> lst sg_msk: segmask: Shape(bs, 32, w, h) + sgot_lst --> lst sgot: seg_output_conf: Shape(bs, n, 32) + ''' + mask_res = [] + for i in range(len(sgot_lst)): + sgot = sgot_lst[i] + sg_msk = sg_msk_lst[i] + t_mask_res = [] + for j in range(sgot.shape[0]): + sgot_t = sgot[j] # (n, 32) + sg_msk_t = sg_msk[j] # (32, w, h) + m_t = (sgot_t@sg_msk_t.reshape(32, -1)).reshape(-1, *sg_msk_t.shape[1:]) + m_t = m_t.unsqueeze(0) + t_mask_res.append(m_t) + mask_res.append(torch.cat(t_mask_res, 0).flatten(0,1)) + return mask_res + + def handleseg_ab(self, sgot_lst, sg_msk_lst): + ''' + sg_msk_lst --> lst sg_msk: segmask: Shape(bs, 32, w, h) + sgot_lst --> lst sgot: seg_output_conf: Shape(bs, num_of_anchors, h, w, 32) + sgot.flatten(1, 3) -> Shape(bs, n*num_of_anchors, 32) + for j in range(bs) -> ((n*num_of_anchor, 32)@(32, w0, h0) = (n*num_of_anchor, 32)@(32, w0, h0)) + ''' + mask_res = [] + for i in range(len(sgot_lst)): + sgot = sgot_lst[i] + sg_msk = sg_msk_lst[i] + s_shape = sgot.shape[1:4] + sgot = sgot.flatten(1, 3) + t_mask_res = [] + for j in range(sgot.shape[0]): + sgot_t = sgot[j] # (n, 32) + sg_msk_t = sg_msk[j] # (32, w, h) + m_t = (sgot_t@sg_msk_t.reshape(32, -1)).reshape(-1, *sg_msk_t.shape[1:]) + m_t = m_t.unsqueeze(0) + t_mask_res.append(m_t) + mask_res.append(torch.cat(t_mask_res, 0).flatten(0,1)) + return mask_res + + + + + + def forward(self, x): + if self.training: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + cls_score_list_ab = [] + reg_dist_list_ab = [] + seg_conf_list_af = [] + seg_conf_list_ab = [] + seg_list = [] + af_seg_list = [] + ab_seg_list = [] + + seg_mask = self.seg_proto[0](x[0]) + seg_list.append(seg_mask) + + + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_base + if self.fuse_ab: + cls_output_ab = self.cls_preds_ab[i](cls_feat) + reg_output_ab = self.reg_preds_ab[i](reg_feat) + seg_output_ab = self.seg_preds_ab[i](seg_feat) + + cls_output_ab = torch.sigmoid(cls_output_ab) + seg_output_ab = torch.sigmoid(seg_output_ab) + if self.fuse_ab: + seg_conf_list_ab.append(seg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2)) + cls_output_ab = cls_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + cls_score_list_ab.append(cls_output_ab.flatten(1,3)) + + + reg_output_ab = reg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + reg_output_ab[..., 2:4] = ((reg_output_ab[..., 2:4].sigmoid() * 2) ** 2 ) * (self.anchors_init[i].reshape(1, self.na, 1, 1, 2).to(device)) + reg_dist_list_ab.append(reg_output_ab.flatten(1,3)) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + cls_output_af = torch.sigmoid(cls_output_af) + # seg_output_af = torch.sigmoid(seg_output_af) + seg_conf_list_af.append(seg_output_af.flatten(2).permute((0, 2, 1))) + + cls_score_list_af.append(cls_output_af.flatten(2).permute((0, 2, 1))) + 
reg_dist_list_af.append(reg_output_af.flatten(2).permute((0, 2, 1))) + + #Not support fuseab now + if False: + ab_seg_list = self.handleseg_ab(seg_conf_list_ab, seg_list) if self.fuse_ab else [] + cls_score_list_ab = torch.cat(cls_score_list_ab, axis=1) + reg_dist_list_ab = torch.cat(reg_dist_list_ab, axis=1) + cls_score_list_af = torch.cat(cls_score_list_af, axis=1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=1) + + return x, cls_score_list_ab, reg_dist_list_ab, cls_score_list_af, reg_dist_list_af, [seg_conf_list_af, seg_list], ab_seg_list + + else: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + seg_list = [] + seg_conf_list_af = [] + seg_mask = self.seg_proto[0](x[0]) + seg_list.append(seg_mask) + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + if self.use_dfl: + reg_output_af = reg_output_af.reshape([-1, 4, self.reg_max + 1, l]).permute(0, 2, 1, 3) + reg_output_af = self.proj_conv(F.softmax(reg_output_af, dim=1)) + + cls_output_af = torch.sigmoid(cls_output_af) + # seg_output_af = torch.sigmoid(seg_output_af) + proto_no = (torch.ones(b, 1, l) * i).cuda() + + + if self.export: + cls_score_list_af.append(cls_output_af) + reg_dist_list_af.append(reg_output_af) + seg_conf_list_af.append(seg_output_af) + else: + cls_score_list_af.append(cls_output_af.reshape([b, self.nc, l])) + reg_dist_list_af.append(reg_output_af.reshape([b, 4, l])) + seg_conf_list_af.append(torch.cat([proto_no, seg_output_af.reshape([b, 32, l])], axis = 1)) #[which_proto, (32...)] + + if self.export: + return tuple(torch.cat([cls, reg, seg], 1) for cls, reg, seg in zip(cls_score_list_af, reg_dist_list_af, seg_conf_list_af)), seg_list[0] + + cls_score_list_af = torch.cat(cls_score_list_af, axis=-1).permute(0, 2, 1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=-1).permute(0, 2, 1) + seg_conf_list_af = torch.cat(seg_conf_list_af, axis=-1).permute(0, 2, 1) + + + + #anchor_free + anchor_points_af, stride_tensor_af = generate_anchors( + x, self.stride, self.grid_cell_size, self.grid_cell_offset, device=x[0].device, is_eval=True, mode='af') + + pred_bboxes_af = dist2bbox(reg_dist_list_af, anchor_points_af, box_format='xywh') + pred_bboxes_af *= stride_tensor_af + + pred_bboxes = pred_bboxes_af + cls_score_list = cls_score_list_af + + return torch.cat( + [ + pred_bboxes, + torch.ones((b, pred_bboxes.shape[1], 1), device=pred_bboxes.device, dtype=pred_bboxes.dtype), + cls_score_list + ], + axis=-1), seg_list, seg_conf_list_af + +class Proto(nn.Module): + # Borrow from YOLOv5 + def __init__(self, num_layers, channels_list, pos, c_=256, c2=32): # ch_in, number of protos, number of masks + super().__init__() + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + c1 = channels_list[chx[pos]] + self.cv1 = Conv(c1, c_, k=3) + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.cv2 = Conv(c_, c_, k=3) + self.cv3 = Conv(c_, c2) + + def forward(self, x): + return self.cv3(self.cv2(self.upsample(self.cv1(x)))) + +def autopad(k, p=None, d=1): # kernel, padding, dilation + # Pad to 'same' shape outputs + if d > 1: + k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size + 
if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + return p + + +class Conv(nn.Module): + # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation) + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): + super().__init__() + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) + self.bn = nn.BatchNorm2d(c2) + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + + +def build_effidehead_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3, num_masks=32, fuse_ab=False): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # seg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # seg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + 
out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # seg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # cls_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + ) + + return head_layers + + + + + + + diff --git a/yolov6/models/heads/effidehead_fuseab_seg_solo.py b/yolov6/models/heads/effidehead_fuseab_seg_solo.py new file mode 100644 index 00000000..61bd1328 --- /dev/null +++ b/yolov6/models/heads/effidehead_fuseab_seg_solo.py @@ -0,0 +1,540 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from yolov6.layers.common import * +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox + + +class Detect(nn.Module): + export = False + '''Efficient Decoupled Head for fusing anchor-base branches. 
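+    The anchor-free branch predicts class scores, DFL box distributions and per-location mask
+    parameters; an auxiliary anchor-based branch can be enabled via fuse_ab. Mask prototypes are
+    built by summing the Proto outputs of the three feature levels.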
+ ''' + def __init__(self, num_classes=80, anchors=None, num_layers=3, inplace=True, head_layers=None, reg_mask=None, use_dfl=True, reg_max=16, nm=32, fuse_ab=False): # detection layer + super().__init__() + assert head_layers is not None + assert reg_mask is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 + nm # number of outputs per anchor + self.nl = num_layers # number of detection layers + self.nm = nm # number of masks + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.grid = [torch.zeros(1)] * num_layers + self.fuse_ab = fuse_ab + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] if num_layers == 3 else [8, 16, 32, 64] # strides computed during build + self.stride = torch.tensor(stride) + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2d(self.reg_max + 1, 1, 1, bias=False) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + self.anchors_init= ((torch.tensor(anchors) / self.stride[:,None])).reshape(self.nl, self.na, 2) + self.reg_mask = reg_mask + + # Init decouple head + self.stems = nn.ModuleList() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.seg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.seg_preds = nn.ModuleList() + self.cls_preds_ab = nn.ModuleList() + self.reg_preds_ab = nn.ModuleList() + self.seg_preds_ab = nn.ModuleList() + self.seg_proto = nn.ModuleList() + self.seg_proto.append(reg_mask[0]) + self.seg_proto.append(reg_mask[1]) + self.seg_proto.append(reg_mask[2]) + + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*10 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.seg_convs.append(head_layers[idx+3]) + self.cls_preds.append(head_layers[idx+4]) + self.reg_preds.append(head_layers[idx+5]) + self.seg_preds.append(head_layers[idx+6]) + if self.fuse_ab: + self.cls_preds_ab.append(head_layers[idx+7]) + self.reg_preds_ab.append(head_layers[idx+8]) + self.seg_preds_ab.append(head_layers[idx+9]) + + + def initialize_biases(self): + + for conv in self.cls_preds: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.cls_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + for conv in self.reg_preds: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.reg_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) 
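+                # re-register the zero-filled weights as a Parameter so they stay trainable after the in-place init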
+ conv.weight = torch.nn.Parameter(w, requires_grad=True) + + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.proj_conv.weight = nn.Parameter(self.proj.view([1, self.reg_max + 1, 1, 1]).clone().detach(), + requires_grad=False) + + + def handleseg_ab(self, sgot_lst, sg_msk_lst): + ''' + sg_msk_lst --> lst sg_msk: segmask: Shape(bs, 32, w, h) + sgot_lst --> lst sgot: seg_output_conf: Shape(bs, num_of_anchors, h, w, 32) + sgot.flatten(1, 3) -> Shape(bs, n*num_of_anchors, 32) + for j in range(bs) -> ((n*num_of_anchor, 32)@(32, w0, h0) = (n*num_of_anchor, 32)@(32, w0, h0)) + ''' + mask_res = [] + for i in range(len(sgot_lst)): + sgot = sgot_lst[i] + sg_msk = sg_msk_lst[i] + s_shape = sgot.shape[1:4] + sgot = sgot.flatten(1, 3) + t_mask_res = [] + for j in range(sgot.shape[0]): + sgot_t = sgot[j] # (n, 32) + sg_msk_t = sg_msk[j] # (32, w, h) + m_t = (sgot_t@sg_msk_t.reshape(self.nm, -1)).reshape(-1, *sg_msk_t.shape[1:]) + m_t = m_t.unsqueeze(0) + t_mask_res.append(m_t) + mask_res.append(torch.cat(t_mask_res, 0).flatten(0,1)) + return mask_res + + + + + + def forward(self, x): + if self.training: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + cls_score_list_ab = [] + reg_dist_list_ab = [] + seg_conf_list_af = [] + seg_conf_list_ab = [] + seg_list = [] + af_seg_list = [] + ab_seg_list = [] + + s1 = self.seg_proto[0](x[0]) + s2 = self.seg_proto[1](x[1]) + s3 = self.seg_proto[2](x[2]) + seg_mask = s1 + s2 + s3 + seg_list.append(seg_mask) + + + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_base + if self.fuse_ab: + cls_output_ab = self.cls_preds_ab[i](cls_feat) + reg_output_ab = self.reg_preds_ab[i](reg_feat) + seg_output_ab = self.seg_preds_ab[i](seg_feat) + + cls_output_ab = torch.sigmoid(cls_output_ab) + seg_output_ab = torch.sigmoid(seg_output_ab) + if self.fuse_ab: + seg_conf_list_ab.append(seg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2)) + cls_output_ab = cls_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + cls_score_list_ab.append(cls_output_ab.flatten(1,3)) + + + reg_output_ab = reg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + reg_output_ab[..., 2:4] = ((reg_output_ab[..., 2:4].sigmoid() * 2) ** 2 ) * (self.anchors_init[i].reshape(1, self.na, 1, 1, 2).to(device)) + reg_dist_list_ab.append(reg_output_ab.flatten(1,3)) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + cls_output_af = torch.sigmoid(cls_output_af) + # seg_output_af = torch.sigmoid(seg_output_af) + seg_conf_list_af.append(seg_output_af.flatten(2).permute((0, 2, 1))) + + cls_score_list_af.append(cls_output_af.flatten(2).permute((0, 2, 1))) + reg_dist_list_af.append(reg_output_af.flatten(2).permute((0, 2, 1))) + + #Not support fuseab now + if False: + ab_seg_list = self.handleseg_ab(seg_conf_list_ab, seg_list) if self.fuse_ab else [] + cls_score_list_ab = torch.cat(cls_score_list_ab, axis=1) + reg_dist_list_ab = torch.cat(reg_dist_list_ab, axis=1) + cls_score_list_af = torch.cat(cls_score_list_af, axis=1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=1) + + return x, cls_score_list_ab, reg_dist_list_ab, cls_score_list_af, reg_dist_list_af, [seg_conf_list_af, seg_list], 
ab_seg_list + + else: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + seg_list = [] + seg_conf_list_af = [] + s1 = self.seg_proto[0](x[0]) + s2 = self.seg_proto[1](x[1]) + s3 = self.seg_proto[2](x[2]) + seg_mask = s1 + s2 + s3 + seg_list.append(seg_mask) + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + if self.use_dfl: + reg_output_af = reg_output_af.reshape([-1, 4, self.reg_max + 1, l]).permute(0, 2, 1, 3) + reg_output_af = self.proj_conv(F.softmax(reg_output_af, dim=1)) + + cls_output_af = torch.sigmoid(cls_output_af) + proto_no = (torch.ones(b, 1, l) * i).cuda() + + + if self.export: + cls_score_list_af.append(cls_output_af) + reg_dist_list_af.append(reg_output_af) + seg_conf_list_af.append(seg_output_af) + else: + cls_score_list_af.append(cls_output_af.reshape([b, self.nc, l])) + reg_dist_list_af.append(reg_output_af.reshape([b, 4, l])) + seg_conf_list_af.append(torch.cat([proto_no, seg_output_af.reshape([b, 67, l])], axis = 1)) #[which_proto, (32...)] + + if self.export: + return tuple(torch.cat([cls, reg, seg], 1) for cls, reg, seg in zip(cls_score_list_af, reg_dist_list_af, seg_conf_list_af)), seg_list[0] + + cls_score_list_af = torch.cat(cls_score_list_af, axis=-1).permute(0, 2, 1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=-1).permute(0, 2, 1) + seg_conf_list_af = torch.cat(seg_conf_list_af, axis=-1).permute(0, 2, 1) + + + + #anchor_free + anchor_points_af, stride_tensor_af = generate_anchors( + x, self.stride, self.grid_cell_size, self.grid_cell_offset, device=x[0].device, is_eval=True, mode='af') + + pred_bboxes_af = dist2bbox(reg_dist_list_af, anchor_points_af, box_format='xywh') + pred_bboxes_af *= stride_tensor_af + + pred_bboxes = pred_bboxes_af + cls_score_list = cls_score_list_af + + return torch.cat( + [ + pred_bboxes, + torch.ones((b, pred_bboxes.shape[1], 1), device=pred_bboxes.device, dtype=pred_bboxes.dtype), + cls_score_list + ], + axis=-1), seg_list, seg_conf_list_af + +class Proto(nn.Module): + # Borrowed from YOLOv5 + def __init__(self, num_layers, channels_list, pos, c_=256, c2=64, scale_factor=2): # ch_in, number of protos, number of masks + super().__init__() + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + c1 = channels_list[chx[pos]] + self.cv1 = Conv(c1, c_, k=3) + self.upsample = nn.Upsample(scale_factor=scale_factor, mode='nearest') + self.cv2 = Conv(c_, c_, k=3) + self.cv3 = Conv(c_, c2) + + def forward(self, x): + return self.cv3(self.cv2(self.upsample(self.cv1(x)))) + +def autopad(k, p=None, d=1): # kernel, padding, dilation + # Pad to 'same' shape outputs + if d > 1: + k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size + if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + return p + + +class Conv(nn.Module): + # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation) + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): + super().__init__() + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) + self.bn = 
nn.BatchNorm2d(c2) + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + + +def build_effidehead_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3, num_masks=67, fuse_ab=False): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # seg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # seg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # seg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 
+ ), + # cls_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + ) + + return head_layers + + + + + + + diff --git a/yolov6/models/losses/loss.py b/yolov6/models/losses/loss.py index ec534923..c4fe8d87 100644 --- a/yolov6/models/losses/loss.py +++ b/yolov6/models/losses/loss.py @@ -30,8 +30,6 @@ def __init__(self, ): self.fpn_strides = fpn_strides - self.cached_feat_sizes = [torch.Size([0, 0]) for _ in fpn_strides] - self.cached_anchors = None self.grid_cell_size = grid_cell_size self.grid_cell_offset = grid_cell_offset self.num_classes = num_classes @@ -60,13 +58,8 @@ def __call__( ): feats, pred_scores, pred_distri = outputs - if all(feat.shape[2:] == cfsize for feat, cfsize in zip(feats, self.cached_feat_sizes)): - anchors, anchor_points, n_anchors_list, stride_tensor = self.cached_anchors - else: - self.cached_feat_sizes = [feat.shape[2:] for feat in feats] - anchors, anchor_points, n_anchors_list, stride_tensor = \ - generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) - self.cached_anchors = anchors, anchor_points, n_anchors_list, stride_tensor + anchors, anchor_points, n_anchors_list, stride_tensor = \ + generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) assert pred_scores.type() == pred_distri.type() gt_bboxes_scale = torch.tensor([batch_width, batch_height, batch_width, batch_height]).type_as(pred_scores) diff --git a/yolov6/models/losses/seg_loss.py b/yolov6/models/losses/seg_loss.py new file mode 100644 index 00000000..04a25ecd --- /dev/null +++ b/yolov6/models/losses/seg_loss.py @@ -0,0 +1,532 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox, bbox2dist, xywh2xyxy, box_iou +from yolov6.utils.figure_iou import IOUloss +from yolov6.assigners.atss_assigner_seg import ATSSAssigner +from yolov6.assigners.tal_assigner_seg import TaskAlignedAssigner +import time +import pickle + +class ComputeLoss: + '''Loss computation func.''' + def __init__(self, + fpn_strides=[8, 16, 32], + grid_cell_size=5.0, + grid_cell_offset=0.5, + num_classes=80, + ori_img_size=640, + warmup_epoch=4, + use_dfl=True, + reg_max=16, + nm=32, + iou_type='giou', + loss_weight={ + 'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5, + 'seg': 2.5}, + ): + + self.fpn_strides = fpn_strides + self.grid_cell_size = grid_cell_size + self.grid_cell_offset = grid_cell_offset + self.num_classes = num_classes + self.ori_img_size = ori_img_size + self.nm = nm + self.tt = nm + self.warmup_epoch = warmup_epoch + self.warmup_assigner = ATSSAssigner(9, num_classes=self.num_classes) + self.formal_assigner = TaskAlignedAssigner(topk=13, num_classes=self.num_classes, 
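+                                              # task-aligned assignment: candidates ranked by cls_score**alpha * IoU**beta, top 13 kept per ground truth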
alpha=1.0, beta=6.0) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.iou_type = iou_type + self.varifocal_loss = VarifocalLoss().cuda() + self.bbox_loss = BboxLoss(self.num_classes, self.reg_max, self.use_dfl, self.iou_type).cuda() + self.loss_weight = loss_weight + + def __call__( + self, + outputs, + targets, + epoch_num, + step_num, + batch_height, + batch_width, + segmasks, + img=None, + ): + + feats, pred_scores, pred_distri, pred_seg = outputs # seg_list:shape(3)(b, nm, mw, mh) seg_conf_list:shape(3):(b, l ,nm) + seg_cf, seg_proto = pred_seg + anchors, anchor_points, n_anchors_list, stride_tensor = \ + generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) + + assert pred_scores.type() == pred_distri.type() + gt_bboxes_scale = torch.tensor([batch_width, batch_height, batch_width, batch_height]).type_as(pred_scores) + batch_size = pred_scores.shape[0] + + targets, gt_segmasks =self.preprocess(targets, batch_size, gt_bboxes_scale, segmasks) + gt_labels = targets[:, :, :1] + gt_bboxes = targets[:, :, 1:] #xyxy + mask_gt = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self.bbox_decode(anchor_points_s, pred_distri) #xyxy + try: + if epoch_num < self.warmup_epoch: + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + anchors, + n_anchors_list, + gt_labels, + gt_bboxes, + mask_gt, + pred_bboxes.detach() * stride_tensor, + gt_segmasks) + else: + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks) + + except RuntimeError: + print( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." 
+ ) + torch.cuda.empty_cache() + print("------------CPU Mode for This Batch-------------") + if epoch_num < self.warmup_epoch: + _anchors = anchors.cpu().float() + _n_anchors_list = n_anchors_list + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + _anchors, + _n_anchors_list, + _gt_labels, + _gt_bboxes, + _mask_gt, + _pred_bboxes * _stride_tensor, + _segmasks) + + else: + _pred_scores = pred_scores.detach().cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _anchor_points = anchor_points.cpu().float() + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + _pred_scores, + _pred_bboxes * _stride_tensor, + _anchor_points, + _gt_labels, + _gt_bboxes, + _mask_gt, + _segmasks) + + target_labels = target_labels.cuda() + target_bboxes = target_bboxes.cuda() + target_scores = target_scores.cuda() + fg_mask = fg_mask.cuda() + for _ in idx_lst: + _ = _.cuda() + + + if step_num % 10 == 0: + torch.cuda.empty_cache() + + # rescale bbox + target_bboxes /= stride_tensor + + # cls loss + target_labels = torch.where(fg_mask > 0, target_labels, torch.full_like(target_labels, self.num_classes)) + one_hot_label = F.one_hot(target_labels.long(), self.num_classes + 1)[..., :-1] + loss_cls = self.varifocal_loss(pred_scores, target_scores, one_hot_label) + + + target_scores_sum = target_scores.sum() + + # avoid devide zero error, devide by zero will cause loss to be inf or nan. 
+ # if target_scores_sum is 0, loss_cls equals to 0 alson + if target_scores_sum > 1: + loss_cls /= target_scores_sum + + # bbox loss + loss_iou, loss_dfl = self.bbox_loss(pred_distri, pred_bboxes, anchor_points_s, target_bboxes, + target_scores, target_scores_sum, fg_mask) + + loss_seg = self.mask_loss(gt_segmasks, seg_cf, seg_proto, target_bboxes, fg_mask, idx_lst, target_scores, target_scores_sum) + + loss = self.loss_weight['class'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + \ + self.loss_weight['seg'] * loss_seg + + + return loss, \ + torch.cat(((self.loss_weight['iou'] * loss_iou).unsqueeze(0), + (self.loss_weight['dfl'] * loss_dfl).unsqueeze(0), + (self.loss_weight['class'] * loss_cls).unsqueeze(0), + (self.loss_weight['seg'] * loss_seg).unsqueeze(0))).detach() + + def preprocess(self, targets, batch_size, scale_tensor, segmask): + targets_list = np.zeros((batch_size, 1, 5)).tolist() + cu = [] + already = [] + # seg_list = np.zeros((batch_size, 1, *segmask.shape[1:])).tolist() + for i, item in enumerate(targets.cpu().numpy().tolist()): + index = int(item[0]) + targets_list[index].append(item[1:]) + if index not in already: + already.append(index) + cu.append(i) + cu.append(segmask.shape[0]) + max_len = max((len(l) for l in targets_list)) + segmasks = torch.zeros(batch_size, max_len - 1, segmask.shape[-2], segmask.shape[-1]).cuda() + if len(already) != 0: + for i in range(len(already)): + j = already[i] + start = cu[i] + end = cu[i+1] + segmasks[j, : end - start] = segmask[start: end].clone() + targets = torch.from_numpy(np.array(list(map(lambda l:l + [[-1,0,0,0,0]]*(max_len - len(l)), targets_list)))[:,1:,:]).to(targets.device) + + batch_target = targets[:, :, 1:5].mul_(scale_tensor) + targets[..., 1:] = xywh2xyxy(batch_target) + return targets, segmasks + + def bbox_decode(self, anchor_points, pred_dist): + if self.use_dfl: + batch_size, n_anchors, _ = pred_dist.shape + pred_dist = F.softmax(pred_dist.view(batch_size, n_anchors, 4, self.reg_max + 1), dim=-1).matmul(self.proj.to(pred_dist.device)) + return dist2bbox(pred_dist, anchor_points) + + def mask_loss(self, gt_segmasks, seg_cf, seg_proto, txyxy_ori, fg_mask, idx_lst, target_scores=None, target_scores_sum=None): + # pred_mask_lst -> list + ''' + pred_mask -> Shape(n1, w, h) + gt_mask -> Shape(n, img_w, img_h) + xyxy -> Shape(n, 4) + sum(n1, n2, n3, ...) = n + torch.abs((xyxy[..., 3] - xyxy[..., 1]) * (xyxy[..., 4] - xyxy[..., 2])) -> area + fg_mask --> (bs, tsize) + idx -> (bs, tsize) + gt_segmasks -> (bs, labelsize, w, h) + ''' + sl = 0 + sl2 = 0 + bl = [2, 4, 8] + num_pos = fg_mask.sum() + tloss = torch.zeros(1).float().cuda() + if num_pos<=0: + for ipred in seg_proto: + tloss += (ipred.sum() * 0.) + for ipred in seg_cf: + tloss += (ipred.sum() * 0.) + return tloss[0] + + + xyxy_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + mtarget_scores = target_scores.sum(-1) # (bs, nl, 1) + + sl = 0 + qf = len(idx_lst) == 1 and len(idx_lst[0].shape) == 2 + if qf: + idx_lst = idx_lst[0] + for j in range(len(seg_cf)): + ishape = 0 + pshape = 0 + + iseg_proto = seg_proto[0] # (bs, 32, h, w) + bs = iseg_proto.shape[0] + iseg_cf = seg_cf[j] # (bs, part_n, 32) + + pshape = iseg_proto.shape[-1] + ishape = iseg_cf.shape[1] # (1) = part_n + idx = idx_lst[:, sl: sl + ishape] # (bs, part_n) + + ifg_mask = fg_mask[:, sl: sl + ishape] # (n) --> (bs, part_n) + itarget_scores = mtarget_scores[:, sl: sl + ishape] + if ifg_mask.sum() <= 0: + tloss += (iseg_proto.sum() * 0.) 
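+                # the 0.-scaled sums keep the unused seg outputs in the autograd graph without contributing to the loss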
+ tloss += (iseg_cf.sum() * 0.) + continue + target_sg = [] + pred_sg = [] + ixyxy_lst = [] + mask_weight = [] + for i in range(bs): + idx_thisbatch = torch.masked_select(idx[i], ifg_mask[i]) #(casize) + igt_segmasks = gt_segmasks.reshape(-1, *gt_segmasks.shape[-2:])[idx_thisbatch] # (?1, h?, w?) --> (?2, h?, w?) + imask_weight = torch.masked_select(itarget_scores[i], ifg_mask[i]).unsqueeze(-1) + mask_weight.append(imask_weight) + target_sg.append(igt_segmasks) + tiseg_cf = torch.masked_select(iseg_cf[i], ifg_mask[i].unsqueeze(-1).repeat(1, self.tt)) # (?2, 32) + tiseg_cf = tiseg_cf.reshape(-1, self.tt) + ipred_seg = (tiseg_cf@iseg_proto[i].reshape(self.tt, -1)).reshape(-1, pshape, pshape) # (?2, h, w) + ixyxy = torch.masked_select(txyxy_ori[i, sl: sl + ishape], xyxy_mask[i, sl: sl + ishape, :]).reshape(-1, 4) # (n, 4) --> (part_n, 4) --> (?2, 4) + ixyxy_lst.append(ixyxy) + pred_sg.append(ipred_seg) + + + + + bxyxy = torch.cat(ixyxy_lst, dim = 0) * bl[j] + bpred_seg = torch.cat(pred_sg, dim = 0) + bgt_seg = torch.cat(target_sg, dim = 0) + masks_weight = torch.cat(mask_weight, dim = 0).reshape(-1) + if tuple(bgt_seg.shape[-2:]) != (pshape, pshape): # downsample + bgt_seg = F.interpolate(bgt_seg[None], (pshape, pshape), mode='nearest')[0] + area = torch.abs((bxyxy[..., 2] - bxyxy[..., 0]) * (bxyxy[..., 3] - bxyxy[..., 1])) + area = area / (pshape) + area = area / (pshape) + + + + + + sl += ishape + loss = F.binary_cross_entropy_with_logits(bpred_seg, bgt_seg, reduction='none') + + loss = (self.crop_mask(loss, bxyxy).mean(dim=(1, 2)) / area) * masks_weight + loss = loss.sum() + tloss += loss + if target_scores_sum > 1: + tloss[0] = tloss[0] / target_scores_sum + return tloss[0] / len(seg_cf) + + + @staticmethod + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
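+        Boxes are compared against pixel indices here, so they must be given at the mask's own resolution.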
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + +class VarifocalLoss(nn.Module): + def __init__(self): + super(VarifocalLoss, self).__init__() + + def forward(self, pred_score,gt_score, label, alpha=0.75, gamma=2.0): + + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + with torch.cuda.amp.autocast(enabled=False): + loss = (F.binary_cross_entropy(pred_score.float(), gt_score.float(), reduction='none') * weight).sum() + + return loss + + +class BboxLoss(nn.Module): + def __init__(self, num_classes, reg_max, use_dfl=False, iou_type='giou'): + super(BboxLoss, self).__init__() + self.num_classes = num_classes + self.iou_loss = IOUloss(box_format='xyxy', iou_type=iou_type, eps=1e-10) + self.reg_max = reg_max + self.use_dfl = use_dfl + + def forward(self, pred_dist, pred_bboxes, anchor_points, + target_bboxes, target_scores, target_scores_sum, fg_mask): + + # select positive samples mask + num_pos = fg_mask.sum() + if num_pos > 0: + # iou loss + bbox_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + target_bboxes_pos = torch.masked_select( + target_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + target_scores.sum(-1), fg_mask).unsqueeze(-1) + loss_iou = self.iou_loss(pred_bboxes_pos, + target_bboxes_pos) * bbox_weight + if target_scores_sum > 1: + loss_iou = loss_iou.sum() / target_scores_sum + else: + loss_iou = loss_iou.sum() + + # dfl loss + if self.use_dfl: + dist_mask = fg_mask.unsqueeze(-1).repeat( + [1, 1, (self.reg_max + 1) * 4]) + pred_dist_pos = torch.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1]) + target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max) + target_ltrb_pos = torch.masked_select( + target_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, + target_ltrb_pos) * bbox_weight + if target_scores_sum > 1: + loss_dfl = loss_dfl.sum() / target_scores_sum + else: + loss_dfl = loss_dfl.sum() + else: + loss_dfl = pred_dist.sum() * 0. + + else: + loss_iou = pred_dist.sum() * 0. + loss_dfl = pred_dist.sum() * 0. + + return loss_iou, loss_dfl + + def _df_loss(self, pred_dist, target): + target_left = target.to(torch.long) + target_right = target_left + 1 + weight_left = target_right.to(torch.float) - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_left.view(-1), reduction='none').view( + target_left.shape) * weight_left + loss_right = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_right.view(-1), reduction='none').view( + target_left.shape) * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + +def dice_loss(pred, + target, + weight=None, + eps=1e-3, + reduction='mean', + naive_dice=False, + avg_factor=None): + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. 
+ - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + +def weight_reduce_loss(loss, + weight=None, + reduction='mean', + avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Optional[Tensor], optional): Element-wise weights. + Defaults to None. + reduction (str, optional): Same as built-in losses of PyTorch. + Defaults to 'mean'. + avg_factor (Optional[float], optional): Average factor when + computing the mean of losses. Defaults to None. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. 
+ """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() \ No newline at end of file diff --git a/yolov6/models/losses/seg_loss_solo_main.py b/yolov6/models/losses/seg_loss_solo_main.py new file mode 100644 index 00000000..3a329beb --- /dev/null +++ b/yolov6/models/losses/seg_loss_solo_main.py @@ -0,0 +1,583 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox, bbox2dist, xywh2xyxy, box_iou +from yolov6.utils.figure_iou import IOUloss +from yolov6.assigners.atss_assigner_seg import ATSSAssigner +from yolov6.assigners.tal_assigner_seg import TaskAlignedAssigner +import time +import pickle + +class ComputeLoss: + '''Loss computation func.''' + def __init__(self, + fpn_strides=[8, 16, 32], + grid_cell_size=5.0, + grid_cell_offset=0.5, + num_classes=80, + ori_img_size=640, + warmup_epoch=4, + use_dfl=True, + reg_max=16, + weight_nums = 66, + bias_nums = 1, + nm = 64, + dyconv_channels = 66, + iou_type='giou', + loss_weight={ + 'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5, + 'seg': 2.5}, + ): + + self.fpn_strides = fpn_strides + self.grid_cell_size = grid_cell_size + self.grid_cell_offset = grid_cell_offset + self.num_classes = num_classes + self.ori_img_size = ori_img_size + self.nm = nm + self.tt = nm + bias_nums + 2 + self.weight_nums = [nm + 2] + self.bias_nums = [bias_nums] + self.dyconv_channels = dyconv_channels + + self.warmup_epoch = warmup_epoch + self.warmup_assigner = ATSSAssigner(9, num_classes=self.num_classes) + self.formal_assigner = TaskAlignedAssigner(topk=13, num_classes=self.num_classes, alpha=1.0, beta=6.0) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.iou_type = iou_type + self.varifocal_loss = VarifocalLoss().cuda() + self.bbox_loss = BboxLoss(self.num_classes, self.reg_max, self.use_dfl, self.iou_type).cuda() + self.loss_weight = loss_weight + self.dice = True + + def parse_dynamic_params(self, flatten_kernels): + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, self.weight_nums + self.bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst * + self.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def handle_proto_coord(self, proto): + _ = proto.shape[-1] + x = torch.arange(0, 1, step = 1 / _).unsqueeze(0).unsqueeze(0).repeat(1, _, 1).to(proto.dtype).to(proto.device) + y = torch.arange(0, 1, step = 1 / _).unsqueeze(0).T.unsqueeze(0).repeat(1, 1, _).to(proto.dtype).to(proto.device) + return torch.cat([proto, x, y]).reshape(1, -1, _, _) + + def __call__( + self, + outputs, + targets, + epoch_num, + step_num, + batch_height, + batch_width, + segmasks, + img=None, + ): + + + feats, 
pred_scores, pred_distri, pred_seg = outputs # seg_list:shape(3)(b, nm, mw, mh) seg_conf_list:shape(3):(b, l ,nm) + seg_cf, seg_proto = pred_seg + anchors, anchor_points, n_anchors_list, stride_tensor = \ + generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) + + assert pred_scores.type() == pred_distri.type() + gt_bboxes_scale = torch.tensor([batch_width, batch_height, batch_width, batch_height]).type_as(pred_scores) + batch_size = pred_scores.shape[0] + + targets, gt_segmasks =self.preprocess(targets, batch_size, gt_bboxes_scale, segmasks) + gt_labels = targets[:, :, :1] + gt_bboxes = targets[:, :, 1:] #xyxy + mask_gt = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + + # pboxes + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self.bbox_decode(anchor_points_s, pred_distri) #xyxy + + + try: + if epoch_num < self.warmup_epoch: + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + anchors, + n_anchors_list, + gt_labels, + gt_bboxes, + mask_gt, + pred_bboxes.detach() * stride_tensor, + gt_segmasks) + else: + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks) + + except RuntimeError: + print( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." + ) + torch.cuda.empty_cache() + print("------------CPU Mode for This Batch-------------") + if epoch_num < self.warmup_epoch: + _anchors = anchors.cpu().float() + _n_anchors_list = n_anchors_list + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + _anchors, + _n_anchors_list, + _gt_labels, + _gt_bboxes, + _mask_gt, + _pred_bboxes * _stride_tensor, + _segmasks) + + else: + _pred_scores = pred_scores.detach().cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _anchor_points = anchor_points.cpu().float() + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + _pred_scores, + _pred_bboxes * _stride_tensor, + _anchor_points, + _gt_labels, + _gt_bboxes, + _mask_gt, + _segmasks) + + target_labels = target_labels.cuda() + target_bboxes = target_bboxes.cuda() + target_scores = target_scores.cuda() + fg_mask = fg_mask.cuda() + for _ in idx_lst: + _ = _.cuda() + + if step_num % 10 == 0: + torch.cuda.empty_cache() + + # rescale bbox + target_bboxes /= stride_tensor + + # cls loss + target_labels = torch.where(fg_mask > 0, target_labels, torch.full_like(target_labels, self.num_classes)) + one_hot_label = F.one_hot(target_labels.long(), self.num_classes + 1)[..., :-1] + loss_cls = self.varifocal_loss(pred_scores, target_scores, one_hot_label) + + + target_scores_sum = target_scores.sum() + + + if target_scores_sum > 1: + loss_cls /= target_scores_sum + + # bbox loss + loss_iou, 
loss_dfl = self.bbox_loss(pred_distri, pred_bboxes, anchor_points_s, target_bboxes, + target_scores, target_scores_sum, fg_mask) + + loss_seg = self.mask_loss(gt_segmasks, seg_cf, seg_proto, target_bboxes, fg_mask, idx_lst, target_scores, target_scores_sum, epoch=0) + + loss = self.loss_weight['class'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + \ + self.loss_weight['seg'] * loss_seg + + + return loss, \ + torch.cat(((self.loss_weight['iou'] * loss_iou).unsqueeze(0), + (self.loss_weight['dfl'] * loss_dfl).unsqueeze(0), + (self.loss_weight['class'] * loss_cls).unsqueeze(0), + (self.loss_weight['seg'] * loss_seg).unsqueeze(0))).detach() + + def preprocess(self, targets, batch_size, scale_tensor, segmask): + targets_list = np.zeros((batch_size, 1, 5)).tolist() + cu = [] + already = [] + for i, item in enumerate(targets.cpu().numpy().tolist()): + index = int(item[0]) + targets_list[index].append(item[1:]) + if index not in already: + already.append(index) + cu.append(i) + cu.append(segmask.shape[0]) + max_len = max((len(l) for l in targets_list)) + segmasks = torch.zeros(batch_size, max_len - 1, segmask.shape[-2], segmask.shape[-1]).cuda() + if len(already) != 0: + for i in range(len(already)): + j = already[i] + start = cu[i] + end = cu[i+1] + segmasks[j, : end - start] = segmask[start: end].clone() + targets = torch.from_numpy(np.array(list(map(lambda l:l + [[-1,0,0,0,0]]*(max_len - len(l)), targets_list)))[:,1:,:]).to(targets.device) + + batch_target = targets[:, :, 1:5].mul_(scale_tensor) + targets[..., 1:] = xywh2xyxy(batch_target) + return targets, segmasks + + def bbox_decode(self, anchor_points, pred_dist): + if self.use_dfl: + batch_size, n_anchors, _ = pred_dist.shape + pred_dist = F.softmax(pred_dist.view(batch_size, n_anchors, 4, self.reg_max + 1), dim=-1).matmul(self.proj.to(pred_dist.device)) + return dist2bbox(pred_dist, anchor_points) + + def mask_loss(self, gt_segmasks, seg_cf, seg_proto, txyxy_ori_s, fg_mask, idx_lst, target_scores=None, target_scores_sum=None, epoch=0): + # pred_mask_lst -> list + ''' + pred_mask -> Shape(n1, w, h) + gt_mask -> Shape(n, img_w, img_h) + xyxy -> Shape(n, 4) + sum(n1, n2, n3, ...) = n + torch.abs((xyxy[..., 3] - xyxy[..., 1]) * (xyxy[..., 4] - xyxy[..., 2])) -> area + fg_mask --> (bs, tsize) + idx -> (bs, tsize) + gt_segmasks -> (bs, labelsize, w, h) + ''' + sl = 0 + sl2 = 0 + bl = [2, 4, 8] + num_pos = fg_mask.sum() + tloss = torch.zeros(1).float().cuda() + if num_pos<=0: + for ipred in seg_proto: + tloss += (ipred.sum() * 0.) + for ipred in seg_cf: + tloss += (ipred.sum() * 0.) + return tloss[0] + + + xyxy_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + mtarget_scores = target_scores.sum(-1) # (bs, nl, 1) + + sl = 0 + qf = len(idx_lst) == 1 and len(idx_lst[0].shape) == 2 + if qf: + idx_lst = idx_lst[0] + _ = [_i.shape[1] for _i in seg_cf] + sp = [2, 4, 8] + fpn = [] + for i in range(0, 3): + fpn.extend([sp[i]] * _[i]) + fpn = torch.Tensor(fpn).unsqueeze(-1).cuda() + txyxy_ori = txyxy_ori_s * fpn.unsqueeze(0).repeat(seg_cf[0].shape[0], 1, 1) + iseg_cf = torch.cat(seg_cf, axis = 1) + iseg_proto = seg_proto[0] # (bs, 32, h, w) + bs = iseg_proto.shape[0] + if fg_mask.sum()<=0: + tloss += (iseg_proto.sum() * 0.) + tloss += (iseg_cf.sum() * 0.) 
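+            # no foreground anchors in this batch: return the zero-valued loss early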
+ return tloss[0] + + pshape = iseg_proto.shape[-1] + ishape = iseg_cf.shape[1] # (1) = part_n + idx = idx_lst[:, :] # (bs, part_n) + + ifg_mask = fg_mask[:, :] # (n) --> (bs, part_n) + itarget_scores = mtarget_scores[:, :] + target_sg = [] + pred_sg = [] + ixyxy_lst = [] + mask_weight = [] + for i in range(bs): + siproto = self.handle_proto_coord(iseg_proto[i]) + iproto = siproto.reshape(1, -1, *siproto.shape[-2:]) + idx_thisbatch = torch.masked_select(idx[i], ifg_mask[i]) #(casize) + igt_segmasks = gt_segmasks.reshape(-1, *gt_segmasks.shape[-2:])[idx_thisbatch] # (?1, h?, w?) --> (?2, h?, w?) + imask_weight = torch.masked_select(itarget_scores[i], ifg_mask[i]).unsqueeze(-1) + tiseg_cf = torch.masked_select(iseg_cf[i], ifg_mask[i].unsqueeze(-1).repeat(1, self.tt)) # (?2, 32) + tiseg_cf = tiseg_cf.reshape(-1, self.tt) + num_inst = tiseg_cf.shape[0] + if num_inst == 0: + tloss[0] += (tiseg_cf.sum() * 0.) + continue + mask_weight.append(imask_weight) + target_sg.append(igt_segmasks) + weights, biases = self.parse_dynamic_params(tiseg_cf) + n_layers = len(weights) + for _i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + iproto, weight, bias=bias, stride=1, padding=0, groups=1) + if _i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, *iproto.shape[-2:]) + ixyxy = torch.masked_select(txyxy_ori[i, :], xyxy_mask[i, :, :]).reshape(-1, 4) # (n, 4) --> (part_n, 4) --> (?2, 4) + ixyxy_lst.append(ixyxy) + pred_sg.append(x) + bxyxy = torch.cat(ixyxy_lst, dim = 0) + bpred_seg = torch.cat(pred_sg, dim = 0) + bgt_seg = torch.cat(target_sg, dim = 0) + masks_weight = torch.cat(mask_weight, dim = 0).reshape(-1) + if tuple(bgt_seg.shape[-2:]) != (pshape, pshape): # downsample + bgt_seg = F.interpolate(bgt_seg[None], (pshape, pshape), mode='nearest')[0] + area = torch.abs((bxyxy[..., 2] - bxyxy[..., 0]) * (bxyxy[..., 3] - bxyxy[..., 1])) + area = area / (pshape) + area = area / (pshape) + + if not self.dice: + loss = F.binary_cross_entropy_with_logits(bpred_seg, bgt_seg, reduction='none') + loss = (self.crop_mask(loss, bxyxy).mean(dim=(1, 2)) / area) * masks_weight + loss = loss.sum() + tloss += loss + if target_scores_sum > 1: + tloss[0] = tloss[0] / target_scores_sum + return tloss[0] / len(seg_cf) + else: + bpred_seg = bpred_seg.sigmoid() + if epoch <= 160: + loss = dice_loss(bpred_seg, bgt_seg, masks_weight, reduction='mean', avg_factor=target_scores_sum if target_scores_sum > 1 else 1) + else: + loss = dice_loss(bpred_seg, bgt_seg, reduction='mean') + tloss += loss + return tloss[0] + + @staticmethod + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + +class VarifocalLoss(nn.Module): + def __init__(self): + super(VarifocalLoss, self).__init__() + + def forward(self, pred_score,gt_score, label, alpha=0.75, gamma=2.0): + + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + with torch.cuda.amp.autocast(enabled=False): + loss = (F.binary_cross_entropy(pred_score.float(), gt_score.float(), reduction='none') * weight).sum() + + return loss + + +class BboxLoss(nn.Module): + def __init__(self, num_classes, reg_max, use_dfl=False, iou_type='giou'): + super(BboxLoss, self).__init__() + self.num_classes = num_classes + self.iou_loss = IOUloss(box_format='xyxy', iou_type=iou_type, eps=1e-10) + self.reg_max = reg_max + self.use_dfl = use_dfl + + def forward(self, pred_dist, pred_bboxes, anchor_points, + target_bboxes, target_scores, target_scores_sum, fg_mask): + + # select positive samples mask + num_pos = fg_mask.sum() + if num_pos > 0: + # iou loss + bbox_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + target_bboxes_pos = torch.masked_select( + target_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + target_scores.sum(-1), fg_mask).unsqueeze(-1) + loss_iou = self.iou_loss(pred_bboxes_pos, + target_bboxes_pos) * bbox_weight + if target_scores_sum > 1: + loss_iou = loss_iou.sum() / target_scores_sum + else: + loss_iou = loss_iou.sum() + + # dfl loss + if self.use_dfl: + dist_mask = fg_mask.unsqueeze(-1).repeat( + [1, 1, (self.reg_max + 1) * 4]) + pred_dist_pos = torch.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1]) + target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max) + target_ltrb_pos = torch.masked_select( + target_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, + target_ltrb_pos) * bbox_weight + if target_scores_sum > 1: + loss_dfl = loss_dfl.sum() / target_scores_sum + else: + loss_dfl = loss_dfl.sum() + else: + loss_dfl = pred_dist.sum() * 0. + + else: + loss_iou = pred_dist.sum() * 0. + loss_dfl = pred_dist.sum() * 0. + + return loss_iou, loss_dfl + + def _df_loss(self, pred_dist, target): + target_left = target.to(torch.long) + target_right = target_left + 1 + weight_left = target_right.to(torch.float) - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_left.view(-1), reduction='none').view( + target_left.shape) * weight_left + loss_right = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_right.view(-1), reduction='none').view( + target_left.shape) * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + +def dice_loss(pred, + target, + weight=None, + eps=1e-3, + reduction='mean', + naive_dice=False, + avg_factor=None): + """Calculate dice loss, there are two forms of dice loss is supported: + Borrowed from MMDetection + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. 
+ - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + +def weight_reduce_loss(loss, + weight=None, + reduction='none', + avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Optional[Tensor], optional): Element-wise weights. + Defaults to None. + reduction (str, optional): Same as built-in losses of PyTorch. + Defaults to 'mean'. + avg_factor (Optional[float], optional): Average factor when + computing the mean of losses. Defaults to None. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. 
+ """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() \ No newline at end of file diff --git a/yolov6/models/reppan.py b/yolov6/models/reppan.py index 2114f521..820f4211 100644 --- a/yolov6/models/reppan.py +++ b/yolov6/models/reppan.py @@ -551,22 +551,14 @@ def __init__( channels_list=None, num_repeats=None, block=BottleRep, - csp_e=float(1)/2, - stage_block_type="BepC3" + csp_e=float(1)/2 ): super().__init__() - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - assert channels_list is not None assert num_repeats is not None - self.Rep_p4 = stage_block( + self.Rep_p4 = BepC3( in_channels=channels_list[3] + channels_list[5], # 512 + 256 out_channels=channels_list[5], # 256 n=num_repeats[5], @@ -574,7 +566,7 @@ def __init__( block=block ) - self.Rep_p3 = stage_block( + self.Rep_p3 = BepC3( in_channels=channels_list[2] + channels_list[6], # 256 + 128 out_channels=channels_list[6], # 128 n=num_repeats[6], @@ -582,7 +574,7 @@ def __init__( block=block ) - self.Rep_n3 = stage_block( + self.Rep_n3 = BepC3( in_channels=channels_list[6] + channels_list[7], # 128 + 128 out_channels=channels_list[8], # 256 n=num_repeats[7], @@ -590,7 +582,7 @@ def __init__( block=block ) - self.Rep_n4 = stage_block( + self.Rep_n4 = BepC3( in_channels=channels_list[5] + channels_list[9], # 256 + 256 out_channels=channels_list[10], # 512 n=num_repeats[8], @@ -795,21 +787,13 @@ def __init__( channels_list=None, num_repeats=None, block=BottleRep, - csp_e=float(1)/2, - stage_block_type="BepC3" + csp_e=float(1)/2 ): super().__init__() assert channels_list is not None assert num_repeats is not None - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - self.reduce_layer0 = ConvBNReLU( in_channels=channels_list[5], # 1024 out_channels=channels_list[6], # 512 @@ -822,7 +806,7 @@ def __init__( out_channels=channels_list[6], # 512 ) - self.Rep_p5 = stage_block( + self.Rep_p5 = BepC3( in_channels=channels_list[4] + channels_list[6], # 768 + 512 out_channels=channels_list[6], # 512 n=num_repeats[6], @@ -842,7 +826,7 @@ def __init__( out_channels=channels_list[7] # 256 ) - self.Rep_p4 = stage_block( + self.Rep_p4 = BepC3( in_channels=channels_list[3] + channels_list[7], # 512 + 256 out_channels=channels_list[7], # 256 n=num_repeats[7], @@ -862,7 +846,7 @@ def __init__( out_channels=channels_list[8] # 128 ) - self.Rep_p3 = stage_block( + self.Rep_p3 = BepC3( in_channels=channels_list[2] + channels_list[8], # 256 + 128 out_channels=channels_list[8], # 128 n=num_repeats[8], @@ -877,7 +861,7 @@ def __init__( stride=2 ) - self.Rep_n4 = stage_block( + self.Rep_n4 = BepC3( in_channels=channels_list[8] + channels_list[8], # 128 + 128 out_channels=channels_list[9], # 256 n=num_repeats[9], @@ -892,7 +876,7 @@ def __init__( stride=2 ) - self.Rep_n5 = stage_block( + self.Rep_n5 = BepC3( in_channels=channels_list[7] + channels_list[9], # 256 + 256 out_channels=channels_list[10], # 512 n=num_repeats[10], @@ -907,7 +891,7 @@ def __init__( stride=2 ) - self.Rep_n6 = stage_block( + self.Rep_n6 = BepC3( in_channels=channels_list[6] + channels_list[10], # 512 + 512 out_channels=channels_list[11], # 1024 n=num_repeats[11], @@ -962,21 +946,13 @@ def __init__( 
channels_list=None, num_repeats=None, block=BottleRep, - csp_e=float(1)/2, - stage_block_type="BepC3" + csp_e=float(1)/2 ): super().__init__() assert channels_list is not None assert num_repeats is not None - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - self.reduce_layer0 = ConvBNReLU( in_channels=channels_list[5], # 1024 out_channels=channels_list[6], # 512 @@ -989,7 +965,7 @@ def __init__( out_channels=channels_list[6], # 512 ) - self.Rep_p5 = stage_block( + self.Rep_p5 = BepC3( in_channels=channels_list[6], # 512 out_channels=channels_list[6], # 512 n=num_repeats[6], @@ -1009,7 +985,7 @@ def __init__( out_channels=channels_list[7], # 256 ) - self.Rep_p4 = stage_block( + self.Rep_p4 = BepC3( in_channels=channels_list[7], # 256 out_channels=channels_list[7], # 256 n=num_repeats[7], @@ -1029,7 +1005,7 @@ def __init__( out_channels=channels_list[8], # 128 ) - self.Rep_p3 = stage_block( + self.Rep_p3 = BepC3( in_channels=channels_list[8], # 128 out_channels=channels_list[8], # 128 n=num_repeats[8], @@ -1044,7 +1020,7 @@ def __init__( stride=2 ) - self.Rep_n4 = stage_block( + self.Rep_n4 = BepC3( in_channels=channels_list[8] + channels_list[8], # 128 + 128 out_channels=channels_list[9], # 256 n=num_repeats[9], @@ -1059,7 +1035,7 @@ def __init__( stride=2 ) - self.Rep_n5 = stage_block( + self.Rep_n5 = BepC3( in_channels=channels_list[7] + channels_list[9], # 256 + 256 out_channels=channels_list[10], # 512 n=num_repeats[10], @@ -1074,7 +1050,7 @@ def __init__( stride=2 ) - self.Rep_n6 = stage_block( + self.Rep_n6 = BepC3( in_channels=channels_list[6] + channels_list[10], # 512 + 512 out_channels=channels_list[11], # 1024 n=num_repeats[11], diff --git a/yolov6/models/yolo.py b/yolov6/models/yolo.py index 2f37f1b1..5e121b79 100644 --- a/yolov6/models/yolo.py +++ b/yolov6/models/yolo.py @@ -63,6 +63,11 @@ def build_network(config, channels, num_classes, num_layers, fuse_ab=False, dist channels_list_neck = config.model.neck.out_channels use_dfl = config.model.head.use_dfl reg_max = config.model.head.reg_max + issolo = config.model.head.issolo + isseg = config.model.head.isseg + npr = config.model.head.npr + npr = make_divisible(npr * width_mul, 8) + nm = config.model.head.nm num_repeat = [(max(round(i * depth_mul), 1) if i > 1 else i) for i in (num_repeat_backbone + num_repeat_neck)] channels_list = [make_divisible(i * width_mul, 8) for i in (channels_list_backbone + channels_list_neck)] @@ -110,8 +115,20 @@ def build_network(config, channels, num_classes, num_layers, fuse_ab=False, dist num_repeats=num_repeat, block=block ) - - if distill_ns: + if isseg: + if issolo: + from yolov6.models.heads.effidehead_fuseab_seg_solo import Detect, build_effidehead_layer, Proto + anchors_init = config.model.head.anchors_init + head_layers = build_effidehead_layer(channels_list, 3, num_classes, reg_max=reg_max, num_layers=num_layers, num_masks=nm + 2 + 1, fuse_ab=fuse_ab) + reg_masks = [Proto(num_layers, channels_list, 0, npr, nm, scale_factor=2), Proto(num_layers, channels_list, 1, npr, nm, scale_factor=4), Proto(num_layers, channels_list, 2, npr, nm, scale_factor=8)] + head = Detect(num_classes, anchors_init, num_layers, head_layers=head_layers, use_dfl=use_dfl, reg_mask=reg_masks, fuse_ab=fuse_ab, nm=nm + 2 + 1) + else: + from yolov6.models.heads.effidehead_fuseab_seg import Detect, build_effidehead_layer, Proto + anchors_init = config.model.head.anchors_init + head_layers = 
build_effidehead_layer(channels_list, 3, num_classes, reg_max=reg_max, num_layers=num_layers, num_masks=nm, fuse_ab=fuse_ab) + reg_masks = [Proto(num_layers, channels_list, 0, npr, nm)] + head = Detect(num_classes, anchors_init, num_layers, head_layers=head_layers, use_dfl=use_dfl, reg_mask=reg_masks, fuse_ab=fuse_ab) + elif distill_ns: from yolov6.models.heads.effidehead_distill_ns import Detect, build_effidehead_layer if num_layers != 3: LOGGER.error('ERROR in: Distill mode not fit on n/s models with P6 head.\n') diff --git a/yolov6/utils/general.py b/yolov6/utils/general.py index cb4418cd..e144f95d 100644 --- a/yolov6/utils/general.py +++ b/yolov6/utils/general.py @@ -5,7 +5,6 @@ import math import torch import requests -import pkg_resources as pkg from pathlib import Path from yolov6.utils.events import LOGGER @@ -94,7 +93,6 @@ def download_ckpt(path): LOGGER.info(f"checkpoint {basename} not exist, try to downloaded it from github.") # need to update the link with every release url = f"https://github.com/meituan/YOLOv6/releases/download/0.4.0/{basename}" - LOGGER.warning(f"downloading url is: {url}, pealse make sure the version of the downloading model is correspoing to the code version!") r = requests.get(url, allow_redirects=True) assert r.status_code == 200, "Unable to download checkpoints, manually download it" open(path, 'wb').write(r.content) @@ -115,13 +113,3 @@ def check_img_size(imgsz, s=32, floor=0): if new_size != imgsz: LOGGER.warning(f'--img-size {imgsz} must be multiple of max stride {s}, updating to {new_size}') return new_size - - -def check_version(current='0.0.0', minimum='0.0.0', name='version ', pinned=False, hard=False, verbose=False): - # Check whether the package's version is match the required version. - current, minimum = (pkg.parse_version(x) for x in (current, minimum)) - result = (current == minimum) if pinned else (current >= minimum) # bool - if hard: - info = f'⚠️ {name}{minimum} is required by YOLOv6, but {name}{current} is currently installed' - assert result, info # assert minimum version requirement - return result diff --git a/yolov6/utils/metrics.py b/yolov6/utils/metrics.py index cbfa130e..c54b4f8a 100644 --- a/yolov6/utils/metrics.py +++ b/yolov6/utils/metrics.py @@ -9,8 +9,9 @@ import torch import warnings from . import general +import torch.nn.functional as F -def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=()): +def ap_per_class_v6(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), prefix = ''): """ Compute the average precision, given the recall and precision curves. Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. 
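The new branch in `build_network` is driven by several head fields that must now exist in the config (`isseg`, `issolo`, `nm`, `npr`, `anchors_init`). Below is an illustrative config fragment; only the field names come from the code above, the concrete numbers are placeholders rather than values taken from this patch.

```python
# Hypothetical head section of a segmentation config; key names follow
# build_network() above, values are illustrative only.
head = dict(
    type='EffiDeHead',
    isseg=True,       # enable the instance-segmentation head path
    issolo=False,     # False -> effidehead_fuseab_seg, True -> effidehead_fuseab_seg_solo
    nm=32,            # number of mask coefficients per detection (placeholder)
    npr=256,          # prototype channels; scaled by width_mul and rounded to a multiple of 8
    use_dfl=True,
    reg_max=16,
    anchors_init=[[10, 13, 19, 19, 33, 23],
                  [30, 61, 59, 59, 59, 119],
                  [116, 90, 185, 185, 373, 326]],
)
```

In the SOLO branch the head layers are built with `num_masks=nm + 2 + 1` and three `Proto` modules at scale factors 2, 4 and 8, whereas the plain segmentation branch uses a single `Proto` and `num_masks=nm`.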
# Arguments @@ -57,7 +58,7 @@ def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names # AP from recall-precision curve for j in range(tp.shape[1]): - ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + ap[ci, j], mpre, mrec = compute_ap_v6(recall[:, j], precision[:, j]) if plot and j == 0: py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 @@ -71,8 +72,112 @@ def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names # i = f1.mean(0).argmax() # max F1 index # return p[:, i], r[:, i], ap, f1[:, i], unique_classes.astype('int32') - return p, r, ap, f1, unique_classes.astype('int32') + AP50_F1_max_idx = len(f1.mean(0)) - f1.mean(0)[::-1].argmax() -1 + ap50, ap = ap[:, 0], ap.mean(1) + mp, mr, map50, map = p[:, AP50_F1_max_idx].mean(), r[:, AP50_F1_max_idx].mean(), ap50.mean(), ap.mean() + return mp, mr, map50, map, AP50_F1_max_idx +def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), eps=1e-16, prefix=''): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Objectness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). + target_cls: True object classes (nparray). + plot: Plot precision-recall curve at mAP@0.5 + save_dir: Plot save directory + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes, nt = np.unique(target_cls, return_counts=True) + nc = unique_classes.shape[0] # number of classes, number of detections + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000)) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = nt[ci] # number of labels + n_p = i.sum() # number of predictions + if n_p == 0 or n_l == 0: + continue + + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + eps) # recall curve + r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + if plot and j == 0: + py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 + + # Compute F1 (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + eps) + plot = False + if plot: + names = [v for k, v in names.items() if k in unique_classes] # list: only classes that have data + names = dict(enumerate(names)) # to dict + plot_pr_curve(px, py, ap, Path(save_dir) / f'{prefix}PR_curve.png', names) + plot_mc_curve(px, f1, Path(save_dir) / f'{prefix}F1_curve.png', names, ylabel='F1') + plot_mc_curve(px, p, Path(save_dir) / f'{prefix}P_curve.png', names, ylabel='Precision') + plot_mc_curve(px, r, Path(save_dir) / f'{prefix}R_curve.png', names, ylabel='Recall') + + i = smooth(f1.mean(0), 0.1).argmax() # max F1 index + p, r, f1 = p[:, i], r[:, i], f1[:, i] + tp = (r * nt).round() # true positives + fp = (tp / (p + eps) - tp).round() # false positives + 
return tp, fp, p, r, f1, ap, unique_classes.astype(int) + +def smooth(y, f=0.05): + # Box filter of fraction f + nf = round(len(y) * f * 2) // 2 + 1 # number of filter elements (must be odd) + p = np.ones(nf // 2) # ones padding + yp = np.concatenate((p * y[0], y, p * y[-1]), 0) # y padded + return np.convolve(yp, np.ones(nf) / nf, mode='valid') # y-smoothed + + +def compute_ap_v6(recall, precision): + """ Compute the average precision, given the recall and precision curves + # Arguments + recall: The recall curve (list) + precision: The precision curve (list) + # Returns + Average precision, precision curve, recall curve + """ + + # Append sentinel values to beginning and end + mrec = np.concatenate(([0.0], recall, [1.0])) + mpre = np.concatenate(([1.0], precision, [0.0])) + + # Compute the precision envelope + mpre = np.flip(np.maximum.accumulate(np.flip(mpre))) + + # Integrate area under curve + method = 'interp' # methods: 'continuous', 'interp' + if method == 'interp': + x = np.linspace(0, 1, 101) # 101-point interp (COCO) + ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate + else: # 'continuous' + i = np.where(mrec[1:] != mrec[:-1])[0] # points where x axis (recall) changes + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve + + return ap, mpre, mrec def compute_ap(recall, precision): """ Compute the average precision, given the recall and precision curves @@ -101,7 +206,6 @@ def compute_ap(recall, precision): return ap, mpre, mrec -# Plots ---------------------------------------------------------------------------------------------------------------- def plot_pr_curve(px, py, ap, save_dir='pr_curve.png', names=()): # Precision-recall curve @@ -142,17 +246,54 @@ def plot_mc_curve(px, py, save_dir='mc_curve.png', names=(), xlabel='Confidence' plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") fig.savefig(Path(save_dir), dpi=250) -def process_batch(detections, labels, iouv): +# def process_batch(detections, labels, iouv): +# """ +# Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format. +# Arguments: +# detections (Array[N, 6]), x1, y1, x2, y2, conf, class +# labels (Array[M, 5]), class, x1, y1, x2, y2 +# Returns: +# correct (Array[N, 10]), for 10 IoU levels +# """ +# correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) +# iou = general.box_iou(labels[:, 1:], detections[:, :4]) +# correct_class = labels[:, 0:1] == detections[:, 5] +# for i in range(len(iouv)): +# x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match +# if x[0].shape[0]: +# matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] +# if x[0].shape[0] > 1: +# matches = matches[matches[:, 2].argsort()[::-1]] +# matches = matches[np.unique(matches[:, 1], return_index=True)[1]] +# # matches = matches[matches[:, 2].argsort()[::-1]] +# matches = matches[np.unique(matches[:, 0], return_index=True)[1]] +# correct[matches[:, 1].astype(int), i] = True +# return torch.tensor(correct, dtype=torch.bool, device=iouv.device) + +def process_batch(detections, labels, iouv, pred_masks=None, gt_masks=None, overlap=False, masks=False): """ - Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format. 
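`compute_ap_v6` integrates the precision envelope over 101 interpolated recall points (the COCO convention); the toy values below are invented, but the steps mirror the function above.

```python
# Toy walk-through of the 101-point interpolated AP used by compute_ap_v6.
import numpy as np

recall    = np.array([0.1, 0.4, 0.6, 0.8])   # made-up recall curve
precision = np.array([1.0, 0.9, 0.7, 0.5])   # matching precision values

mrec = np.concatenate(([0.0], recall, [1.0]))          # sentinel values
mpre = np.concatenate(([1.0], precision, [0.0]))
mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))   # precision envelope

x = np.linspace(0, 1, 101)                             # 101 recall points
ap = np.trapz(np.interp(x, mrec, mpre), x)
print(f"AP ~= {ap:.3f}")
```

`ap_per_class_v6` then reports precision and recall at the confidence index where the mean F1 curve peaks, while the yolov5-style `ap_per_class` smooths F1 and returns per-class vectors for the box/mask bookkeeping below.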
+ Return correct prediction matrix Arguments: - detections (Array[N, 6]), x1, y1, x2, y2, conf, class - labels (Array[M, 5]), class, x1, y1, x2, y2 + detections (array[N, 6]), x1, y1, x2, y2, conf, class + labels (array[M, 5]), class, x1, y1, x2, y2 Returns: - correct (Array[N, 10]), for 10 IoU levels + correct (array[N, 10]), for 10 IoU levels """ + if masks: + gt_masks = gt_masks.to(pred_masks.device) + if overlap: + nl = len(labels) + index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1 + gt_masks = gt_masks.repeat(nl, 1, 1) # shape(1,640,640) -> (n,640,640) + gt_masks = torch.where(gt_masks == index, 1.0, 0.0) + if gt_masks.shape[1:] != pred_masks.shape[1:]: + gt_masks = F.interpolate(gt_masks[None].to(torch.float32), pred_masks.shape[1:], mode='bilinear', align_corners=False)[0] + gt_masks = gt_masks.gt_(0.5) + iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1).float(), pred_masks.view(pred_masks.shape[0], -1)).to(iouv.device) + else: # boxes + iou = box_iou(labels[:, 1:], detections[:, :4]).to(iouv.device) + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) - iou = general.box_iou(labels[:, 1:], detections[:, :4]) correct_class = labels[:, 0:1] == detections[:, 5] for i in range(len(iouv)): x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match @@ -256,3 +397,232 @@ def plot(self, normalize=True, save_dir='', names=()): def print(self): for i in range(self.nc + 1): print(' '.join(map(str, self.matrix[i]))) + + +def ap_per_class_box_and_mask( + tp_m, + tp_b, + conf, + pred_cls, + target_cls, + plot=False, + save_dir='.', + names=(), + is_v6=False +): + """ + Args: + tp_b: tp of boxes. + tp_m: tp of masks. + other arguments see `func: ap_per_class`. + #return p, r, ap, f1, unique_classes.astype('int32') + """ + if not is_v6: + results_boxes = ap_per_class(tp_b, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Box')[2:] + results_masks = ap_per_class(tp_m, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Mask')[2:] + + results = { + 'boxes': { + 'p': results_boxes[0], + 'r': results_boxes[1], + 'ap': results_boxes[3], + 'f1': results_boxes[2], + 'ap_class': results_boxes[4]}, + 'masks': { + 'p': results_masks[0], + 'r': results_masks[1], + 'ap': results_masks[3], + 'f1': results_masks[2], + 'ap_class': results_masks[4]}} + return results + else: + results_boxes = ap_per_class_v6(tp_b, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Box') + results_masks = ap_per_class(tp_m, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Mask') + return results_boxes, results_masks + +class Metric: + + def __init__(self) -> None: + self.p = [] # (nc, ) + self.r = [] # (nc, ) + self.f1 = [] # (nc, ) + self.all_ap = [] # (nc, 10) + self.ap_class_index = [] # (nc, ) + + @property + def ap50(self): + """AP@0.5 of all classes. + Return: + (nc, ) or []. + """ + return self.all_ap[:, 0] if len(self.all_ap) else [] + + @property + def ap(self): + """AP@0.5:0.95 + Return: + (nc, ) or []. + """ + return self.all_ap.mean(1) if len(self.all_ap) else [] + + @property + def mp(self): + """mean precision of all classes. + Return: + float. + """ + return self.p.mean() if len(self.p) else 0.0 + + @property + def mr(self): + """mean recall of all classes. + Return: + float. 
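The masks branch of the new `process_batch` expands a single "overlap" ground-truth mask, where pixel value `i + 1` marks instance `i`, into per-instance binary masks before computing mask IoU. A small sketch of just that expansion, with an invented 6x6 mask:

```python
# Sketch of the overlap-mask expansion used in process_batch(..., overlap=True).
import torch

nl = 3                                       # number of GT instances in the image
gt = torch.zeros(1, 6, 6)                    # one mask; pixel value i+1 marks instance i
gt[0, 0:2, 0:2] = 1
gt[0, 2:4, 2:4] = 2
gt[0, 4:6, 4:6] = 3

index = torch.arange(nl).view(nl, 1, 1) + 1              # instance ids, shape (nl, 1, 1)
per_instance = torch.where(gt.repeat(nl, 1, 1) == index, 1.0, 0.0)
print(per_instance.shape, per_instance.sum(dim=(1, 2)))  # (3, 6, 6), 4 pixels per instance
```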
+ """ + return self.r.mean() if len(self.r) else 0.0 + + @property + def map50(self): + """Mean AP@0.5 of all classes. + Return: + float. + """ + return self.all_ap[:, 0].mean() if len(self.all_ap) else 0.0 + + @property + def map(self): + """Mean AP@0.5:0.95 of all classes. + Return: + float. + """ + return self.all_ap.mean() if len(self.all_ap) else 0.0 + + def mean_results(self): + """Mean of results, return mp, mr, map50, map""" + return (self.mp, self.mr, self.map50, self.map) + + def class_result(self, i): + """class-aware result, return p[i], r[i], ap50[i], ap[i]""" + return (self.p[i], self.r[i], self.ap50[i], self.ap[i]) + + def get_maps(self, nc): + maps = np.zeros(nc) + self.map + for i, c in enumerate(self.ap_class_index): + maps[c] = self.ap[i] + return maps + + def update(self, results): + """ + Args: + results: tuple(p, r, ap, f1, ap_class) + """ + p, r, all_ap, f1, ap_class_index = results + self.p = p + self.r = r + self.all_ap = all_ap + self.f1 = f1 + self.ap_class_index = ap_class_index + + +class Metrics: + """Metric for boxes and masks.""" + + def __init__(self) -> None: + self.metric_box = Metric() + self.metric_mask = Metric() + + def update(self, results): + """ + Args: + results: Dict{'boxes': Dict{}, 'masks': Dict{}} + """ + self.metric_box.update(list(results['boxes'].values())) + self.metric_mask.update(list(results['masks'].values())) + + def mean_results(self): + return self.metric_box.mean_results() + self.metric_mask.mean_results() + + def class_result(self, i): + return self.metric_box.class_result(i) + self.metric_mask.class_result(i) + + def get_maps(self, nc): + return self.metric_box.get_maps(nc) + self.metric_mask.get_maps(nc) + + @property + def ap_class_index(self): + # boxes and masks have the same ap_class_index + return self.metric_box.ap_class_index + +def mask_iou(mask1, mask2, eps=1e-7): + """ + mask1: [N, n] m1 means number of predicted objects + mask2: [M, n] m2 means number of gt objects + Note: n means image_w x image_h + + return: masks iou, [N, M] + """ + mask1 = mask1.float() + intersection = torch.matmul(mask1, mask2.t()).clamp(0) + union = (mask1.sum(1)[:, None] + mask2.sum(1)[None]) - intersection # (area1 + area2) - intersection + return intersection / (union + eps) + + +def masks_iou(mask1, mask2, eps=1e-7): + """ + mask1: [N, n] m1 means number of predicted objects + mask2: [N, n] m2 means number of gt objects + Note: n means image_w x image_h + + return: masks iou, (N, ) + """ + intersection = (mask1 * mask2).sum(1).clamp(0) # (N, ) + union = (mask1.sum(1) + mask2.sum(1))[None] - intersection # (area1 + area2) - intersection + return intersection / (union + eps) + +def box_iou(box1, box2, eps=1e-7): + # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py + """ + Return intersection-over-union (Jaccard index) of boxes. + Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
+ Arguments: + box1 (Tensor[N, 4]) + box2 (Tensor[M, 4]) + Returns: + iou (Tensor[N, M]): the NxM matrix containing the pairwise + IoU values for every element in boxes1 and boxes2 + """ + + # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) + (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2) + inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2) + + # IoU = inter / (area1 + area2 - inter) + return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps) \ No newline at end of file diff --git a/yolov6/utils/nms.py b/yolov6/utils/nms.py index 0f812642..c7369ba0 100644 --- a/yolov6/utils/nms.py +++ b/yolov6/utils/nms.py @@ -103,3 +103,164 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=Non break # time limit exceeded return output + + +def non_max_suppression_seg(predictions, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, max_det=300): + """Runs Non-Maximum Suppression (NMS) on inference results. + This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775 + Args: + prediction: (tensor), with shape [N, 5 + num_classes], N is the number of bboxes. + conf_thres: (float) confidence threshold. + iou_thres: (float) iou threshold. + classes: (None or list[int]), if a list is provided, nms only keep the classes you provide. + agnostic: (bool), when it is set to True, we do class-independent nms, otherwise, different class would do nms respectively. + multi_label: (bool), when it is set to True, one box can have multi labels, otherwise, one box only huave one label. + max_det:(int), max number of output bboxes. + + Returns: + list of detections, echo item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. + """ + prediction = predictions[0] + confs = predictions[2] # (bs, which_proto, fs) + prediction = torch.cat([prediction, confs], axis=2)# (bs, l ,5 + num_classes + 33) + + num_classes = prediction.shape[2] - 5 - 33 # number of classes + pred_candidates = torch.logical_and(prediction[..., 4] > conf_thres, torch.max(prediction[..., 5: 5 + num_classes], axis=-1)[0] > conf_thres) # candidates + # Check the parameters. + assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.' + assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.' + + # Function settings. + max_wh = 4096 # maximum box width and height + max_nms = 30000 # maximum number of boxes put into torchvision.ops.nms() + time_limit = 10.0 # quit the function when nms cost time exceed the limit time. + multi_label &= num_classes > 1 # multiple labels per box + + tik = time.time() + output = [torch.zeros((0, 6 + 33), device=prediction.device)] * prediction.shape[0] + for img_idx, x in enumerate(prediction): # image index, image inference + x = x[pred_candidates[img_idx]] # confidence + + # If no box remains, skip the next process. 
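The pairwise mask IoU above works on flattened binary masks with a single matrix multiplication. A self-contained check that mirrors the helper, using toy 4x4 masks:

```python
# Local mirror of mask_iou() from the patch, checked on two tiny masks.
import torch

def mask_iou(mask1, mask2, eps=1e-7):
    # mask1: (N, n) predicted masks, mask2: (M, n) GT masks, n = H*W
    mask1 = mask1.float()
    inter = torch.matmul(mask1, mask2.t()).clamp(0)
    union = mask1.sum(1)[:, None] + mask2.sum(1)[None] - inter
    return inter / (union + eps)

pred = torch.zeros(2, 4, 4); pred[0, :2, :2] = 1; pred[1, 2:, 2:] = 1
gt   = torch.zeros(1, 4, 4); gt[0, :2, :2] = 1
print(mask_iou(pred.view(2, -1), gt.view(1, -1)))   # ~[[1.0], [0.0]]
```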
+ if not x.shape[0]: + continue + + # confidence multiply the objectness + x[:, 5: 5 + num_classes] *= x[:, 4:5] # conf = obj_conf * cls_conf + + # (center x, center y, width, height) to (x1, y1, x2, y2) + box = xywh2xyxy(x[:, :4]) + segconf = x[:, 5 + num_classes: ] + + # Detections matrix's shape is (n,6), each row represents (xyxy, conf, cls) + if multi_label: + box_idx, class_idx = (x[:, 5: 5 + num_classes] > conf_thres).nonzero(as_tuple=False).T + x = torch.cat((box[box_idx], x[box_idx, class_idx + 5, None], class_idx[:, None].float(), segconf[box_idx]), 1) + else: # Only keep the class with highest scores. + conf, class_idx = x[:, 5: 5 + num_classes].max(1, keepdim=True) + x = torch.cat((box, conf, class_idx.float(), segconf), 1)[conf.view(-1) > conf_thres] + + # Filter by class, only keep boxes whose category is in classes. + if classes is not None: + x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] + + # Check shape + num_box = x.shape[0] # number of boxes + if not num_box: # no boxes kept. + continue + elif num_box > max_nms: # excess max boxes' number. + x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence + + # Batched NMS + class_offset = x[:, 5:6] * (0 if agnostic else max_wh) # classes + boxes, scores = x[:, :4] + class_offset, x[:, 4] # boxes (offset by class), scores + keep_box_idx = torchvision.ops.nms(boxes, scores, iou_thres) # NMS + if keep_box_idx.shape[0] > max_det: # limit detections + keep_box_idx = keep_box_idx[:max_det] + + output[img_idx] = x[keep_box_idx] + if (time.time() - tik) > time_limit: + print(f'WARNING: NMS cost time exceed the limited {time_limit}s.') + break # time limit exceeded + + return output + +def non_max_suppression_seg_solo(predictions, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, max_det=300): + """Runs Non-Maximum Suppression (NMS) on inference results. + This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775 + Args: + prediction: (tensor), with shape [N, 5 + num_classes], N is the number of bboxes. + conf_thres: (float) confidence threshold. + iou_thres: (float) iou threshold. + classes: (None or list[int]), if a list is provided, nms only keep the classes you provide. + agnostic: (bool), when it is set to True, we do class-independent nms, otherwise, different class would do nms respectively. + multi_label: (bool), when it is set to True, one box can have multi labels, otherwise, one box only huave one label. + max_det:(int), max number of output bboxes. + + Returns: + list of detections, echo item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. + """ + prediction = predictions[0] + confs = predictions[2] # (bs, which_proto, fs) + prediction = torch.cat([prediction, confs], axis=2)# (bs, l ,5 + num_classes + 68) + + num_classes = prediction.shape[2] - 5 - 68 # number of classes + pred_candidates = torch.logical_and(prediction[..., 4] > conf_thres, torch.max(prediction[..., 5: 5 + num_classes], axis=-1)[0] > conf_thres) # candidates + # Check the parameters. + assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.' + assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.' + + # Function settings. 
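`non_max_suppression_seg` concatenates 33 extra mask-related channels onto every prediction and carries them through NMS, so each surviving row is `[x1, y1, x2, y2, conf, cls, <33 extra channels>]`; the exact meaning of those 33 channels is head-specific, so treat the layout below as my reading of the code rather than a documented contract.

```python
# Unpacking one image's detections from non_max_suppression_seg() output.
import torch

det = torch.zeros(5, 6 + 33)   # pretend 5 detections survived NMS (dummy values)
boxes, conf, cls, seg = det[:, :4], det[:, 4], det[:, 5], det[:, 6:]
print(boxes.shape, conf.shape, cls.shape, seg.shape)   # (5,4) (5,) (5,) (5,33)
```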
+ max_wh = 4096 # maximum box width and height + max_nms = 30000 # maximum number of boxes put into torchvision.ops.nms() + time_limit = 10.0 # quit the function when nms cost time exceed the limit time. + multi_label &= num_classes > 1 # multiple labels per box + + tik = time.time() + output = [torch.zeros((0, 6 + 68), device=prediction.device)] * prediction.shape[0] + for img_idx, x in enumerate(prediction): # image index, image inference + x = x[pred_candidates[img_idx]] # confidence + + # If no box remains, skip the next process. + if not x.shape[0]: + continue + + # confidence multiply the objectness + x[:, 5: 5 + num_classes] *= x[:, 4:5] # conf = obj_conf * cls_conf + + # (center x, center y, width, height) to (x1, y1, x2, y2) + box = xywh2xyxy(x[:, :4]) + segconf = x[:, 5 + num_classes: ] + + # Detections matrix's shape is (n,6), each row represents (xyxy, conf, cls) + if multi_label: + box_idx, class_idx = (x[:, 5: 5 + num_classes] > conf_thres).nonzero(as_tuple=False).T + x = torch.cat((box[box_idx], x[box_idx, class_idx + 5, None], class_idx[:, None].float(), segconf[box_idx]), 1) + else: # Only keep the class with highest scores. + conf, class_idx = x[:, 5: 5 + num_classes].max(1, keepdim=True) + x = torch.cat((box, conf, class_idx.float(), segconf), 1)[conf.view(-1) > conf_thres] + + # Filter by class, only keep boxes whose category is in classes. + if classes is not None: + x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] + + # Check shape + num_box = x.shape[0] # number of boxes + if not num_box: # no boxes kept. + continue + elif num_box > max_nms: # excess max boxes' number. + x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence + + # Batched NMS + class_offset = x[:, 5:6] * (0 if agnostic else max_wh) # classes + boxes, scores = x[:, :4] + class_offset, x[:, 4] # boxes (offset by class), scores + keep_box_idx = torchvision.ops.nms(boxes, scores, iou_thres) # NMS + if keep_box_idx.shape[0] > max_det: # limit detections + keep_box_idx = keep_box_idx[:max_det] + + output[img_idx] = x[keep_box_idx] + if (time.time() - tik) > time_limit: + print(f'WARNING: NMS cost time exceed the limited {time_limit}s.') + break # time limit exceeded + + return output diff --git a/yolov6/utils/test1.py b/yolov6/utils/test1.py new file mode 100644 index 00000000..246494f2 --- /dev/null +++ b/yolov6/utils/test1.py @@ -0,0 +1,23 @@ +def process_batch(detections, labels, iouv): + """ + Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format. 
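Both segmentation NMS variants reuse the standard class-offset trick: boxes are shifted by `class_id * max_wh` so one `torchvision.ops.nms` call never suppresses across classes unless `agnostic=True`. A small demo with made-up boxes:

```python
# Demo of the class-offset trick: the same box with two different classes
# survives per-class NMS but not class-agnostic NMS.
import torch
import torchvision

boxes   = torch.tensor([[10., 10., 50., 50.],
                        [10., 10., 50., 50.]])   # identical, made-up boxes
scores  = torch.tensor([0.9, 0.8])
classes = torch.tensor([0., 1.])
max_wh  = 4096

print(torchvision.ops.nms(boxes, scores, 0.45).tolist())                              # [0]
print(torchvision.ops.nms(boxes + classes[:, None] * max_wh, scores, 0.45).tolist())  # [0, 1]
```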
+ Arguments: + detections (Array[N, 6]), x1, y1, x2, y2, conf, class + labels (Array[M, 5]), class, x1, y1, x2, y2 + Returns: + correct (Array[N, 10]), for 10 IoU levels + """ + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) + iou = general.box_iou(labels[:, 1:], detections[:, :4]) + correct_class = labels[:, 0:1] == detections[:, 5] + for i in range(len(iouv)): + x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + # matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + correct[matches[:, 1].astype(int), i] = True + return torch.tensor(correct, dtype=torch.bool, device=iouv.device) \ No newline at end of file diff --git a/yolov6/utils/test2.py b/yolov6/utils/test2.py new file mode 100644 index 00000000..f21ad021 --- /dev/null +++ b/yolov6/utils/test2.py @@ -0,0 +1,37 @@ +def process_batch(detections, labels, iouv, pred_masks=None, gt_masks=None, overlap=False, masks=False): + """ + Return correct prediction matrix + Arguments: + detections (array[N, 6]), x1, y1, x2, y2, conf, class + labels (array[M, 5]), class, x1, y1, x2, y2 + Returns: + correct (array[N, 10]), for 10 IoU levels + """ + #breakpoint() + if masks: + gt_masks = gt_masks.to(pred_masks.device) + if overlap: + nl = len(labels) + index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1 + gt_masks = gt_masks.repeat(nl, 1, 1) # shape(1,640,640) -> (n,640,640) + gt_masks = torch.where(gt_masks == index, 1.0, 0.0) + if gt_masks.shape[1:] != pred_masks.shape[1:]: + gt_masks = F.interpolate(gt_masks[None].to(torch.float32), pred_masks.shape[1:], mode='bilinear', align_corners=False)[0] + gt_masks = gt_masks.gt_(0.5) + iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1)).to(iouv.device) + else: # boxes + iou = box_iou(labels[:, 1:], detections[:, :4]).to(iouv.device) + + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) + correct_class = labels[:, 0:1] == detections[:, 5] + for i in range(len(iouv)): + x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + # matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + correct[matches[:, 1].astype(int), i] = True + return torch.tensor(correct, dtype=torch.bool, device=iouv.device) \ No newline at end of file
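The `process_batch` helpers above (including the copies in `test1.py`/`test2.py`) decide which detections count as true positives by sorting candidate label/detection pairs by IoU and keeping each detection and each label at most once. A compact sketch of that de-duplication step with invented numbers:

```python
# Greedy match de-duplication from process_batch(): best IoU first, then one
# match per detection and one per label.
import numpy as np

# columns: label idx, detection idx, IoU (made-up values)
matches = np.array([[0, 0, 0.90],
                    [0, 1, 0.80],   # label 0 also matched by detection 1
                    [1, 1, 0.60]])  # detection 1 also matches label 1

matches = matches[matches[:, 2].argsort()[::-1]]                    # sort by IoU, descending
matches = matches[np.unique(matches[:, 1], return_index=True)[1]]   # keep one row per detection
matches = matches[np.unique(matches[:, 0], return_index=True)[1]]   # keep one row per label
print(matches)   # [[0. 0. 0.9]] -> detection 0 is the TP for label 0 at this IoU threshold
```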