diff --git a/README.md b/README.md index 92d8ee93..47d1deab 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ English | [简体中文](README_cn.md) Open In Kaggle
- +cp ## YOLOv6 Implementation of paper: diff --git a/configs/base/README.md b/configs/base/README.md deleted file mode 100644 index 77ef5a4e..00000000 --- a/configs/base/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## YOLOv6 base model - -English | [简体中文](./README_cn.md) - -### Features - -- Use only regular convolution and ReLU activation functions. - -- Apply CSP (1/2 channel dim) blocks in the network structure, except for Nano base model. - -Advantages: -- Adopt a unified network structure and configuration, and the accuracy loss of the PTQ 8-bit quantization model is negligible. -- Suitable for users who are just getting started or who need to apply, optimize and deploy an 8-bit quantization model quickly and frequently. - - -### Performance - -| Model | Size | mAPval
0.5:0.95 | SpeedT4
TRT FP16 b1
(FPS) | SpeedT4
TRT FP16 b32
(FPS) | SpeedT4
TRT INT8 b1
(FPS) | SpeedT4
TRT INT8 b32
(FPS) | Params
(M) | FLOPs
(G) | -| :--------------------------------------------------------------------------------------------- | --- | ----------------- | ----- | ---- | ---- | ---- | ----- | ------ | -| [**YOLOv6-N-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n_base.pt) | 640 | 36.6distill | 727 | 1302 | 814 | 1805 | 4.65 | 11.46 | -| [**YOLOv6-S-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s_base.pt) | 640 | 45.3distill | 346 | 525 | 487 | 908 | 13.14 | 30.6 | -| [**YOLOv6-M-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m_base.pt) | 640 | 49.4distill | 179 | 245 | 284 | 439 | 28.33 | 72.30 | -| [**YOLOv6-L-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6l_base.pt) | 640 | 51.1distill | 116 | 157 | 196 | 288 | 59.61 | 150.89 | - -- Speed is tested with TensorRT 8.2.4.2 on T4. -- The processes of model training, evaluation, and inference are the same as the original ones. For details, please refer to [this README](https://github.com/meituan/YOLOv6#quick-start). diff --git a/configs/base/README_cn.md b/configs/base/README_cn.md deleted file mode 100644 index b6b01d14..00000000 --- a/configs/base/README_cn.md +++ /dev/null @@ -1,25 +0,0 @@ -## YOLOv6 基础版模型 - -简体中文 | [English](./README.md) - -### 模型特点 - -- 仅使用常规卷积和Relu激活函数 - -- 网络结构均采用CSP (1/2通道) block,Nano网络除外。 - -优势: -- 采用统一的网络结构和配置,且 PTQ 8位量化模型精度损失较小,适合刚入门或有快速迭代部署8位量化模型需求的用户。 - - -### 模型指标 - -| 模型 | 尺寸 | mAPval
0.5:0.95 | 速度T4
TRT FP16 b1
(FPS) | 速度T4
TRT FP16 b32
(FPS) | 速度T4
TRT INT8 b1
(FPS) | 速度T4
TRT INT8 b32
(FPS) | Params
(M) | FLOPs
(G) | -| :--------------------------------------------------------------------------------------------- | --- | ----------------- | ----- | ---- | ---- | ---- | ----- | ------ | -| [**YOLOv6-N-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n_base.pt) | 640 | 36.6distill | 727 | 1302 | 814 | 1805 | 4.65 | 11.46 | -| [**YOLOv6-S-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s_base.pt) | 640 | 45.3distill | 346 | 525 | 487 | 908 | 13.14 | 30.6 | -| [**YOLOv6-M-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m_base.pt) | 640 | 49.4distill | 179 | 245 | 284 | 439 | 28.33 | 72.30 | -| [**YOLOv6-L-base**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6l_base.pt) | 640 | 51.1distill | 116 | 157 | 196 | 288 | 59.61 | 150.89 | - -- 速度是在 T4 上测试的,TensorRT 版本为 8.4.2.4; -- 模型训练、评估、推理流程与原来保持一致,具体可参考 [首页 README 文档](https://github.com/meituan/YOLOv6/blob/main/README_cn.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B)。 diff --git a/configs/base/yolov6l_base_finetune.py b/configs/base/yolov6l_base_finetune.py deleted file mode 100644 index 7e8dc062..00000000 --- a/configs/base/yolov6l_base_finetune.py +++ /dev/null @@ -1,63 +0,0 @@ -# YOLOv6 large base model -model = dict( - type='YOLOv6l_base', - depth_multiple=1.0, - width_multiple=1.0, - pretrained=None, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6m_base_finetune.py b/configs/base/yolov6m_base_finetune.py deleted file mode 100644 index af5449ec..00000000 --- a/configs/base/yolov6m_base_finetune.py +++ /dev/null @@ -1,67 +0,0 @@ -# YOLOv6m medium/large base model -model = dict( - type='YOLOv6m_base', - pretrained=None, - depth_multiple=0.80, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 0.8, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, 
- lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6n_base.py b/configs/base/yolov6n_base.py deleted file mode 100644 index 8340ca60..00000000 --- a/configs/base/yolov6n_base.py +++ /dev/null @@ -1,66 +0,0 @@ -# YOLOv6s nano base model -model = dict( - type='YOLOv6n_base', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, # set to True if you want to further train with distillation - reg_max=16, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6n_base_finetune.py b/configs/base/yolov6n_base_finetune.py deleted file mode 100644 index 593c3def..00000000 --- a/configs/base/yolov6n_base_finetune.py +++ /dev/null @@ -1,66 +0,0 @@ -# YOLOv6s nanao base model -model = dict( - type='YOLOv6n_base', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6s_base.py b/configs/base/yolov6s_base.py deleted file mode 100644 index 4e28c178..00000000 --- 
a/configs/base/yolov6s_base.py +++ /dev/null @@ -1,68 +0,0 @@ -# YOLOv6s small base model -model = dict( - type='YOLOv6s_base', - pretrained=None, - depth_multiple=0.70, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='CSPRepBiFPANNeck',#CSPRepPANNeck - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, # set to True if you want to further train with distillation - reg_max=16, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -training_mode = "conv_relu" diff --git a/configs/base/yolov6s_base_finetune.py b/configs/base/yolov6s_base_finetune.py deleted file mode 100644 index eb4d2159..00000000 --- a/configs/base/yolov6s_base_finetune.py +++ /dev/null @@ -1,68 +0,0 @@ -# YOLOv6s small base model -model = dict( - type='YOLOv6s_base', - pretrained=None, - depth_multiple=0.70, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_relu" diff --git a/configs/experiment/eval_640_repro.py b/configs/experiment/eval_640_repro.py deleted file mode 100644 index 1f6a6217..00000000 --- a/configs/experiment/eval_640_repro.py +++ /dev/null @@ -1,79 +0,0 @@ -# eval param for different scale - -eval_params = dict( - default = dict( - img_size=640, - shrink_size=2, - infer_on_rect=False, - ), - yolov6n = dict( - img_size=640, - shrink_size=4, - infer_on_rect=False, - ), - yolov6t = dict( - img_size=640, - shrink_size=6, - infer_on_rect=False, - ), - yolov6s = 
dict( - img_size=640, - shrink_size=6, - infer_on_rect=False, - ), - yolov6m = dict( - img_size=640, - shrink_size=4, - infer_on_rect=False, - ), - yolov6l = dict( - img_size=640, - shrink_size=4, - infer_on_rect=False, - ), - yolov6l_relu = dict( - img_size=640, - shrink_size=2, - infer_on_rect=False, - ), - yolov6n6 = dict( - img_size=1280, - shrink_size=17, - infer_on_rect=False, - ), - yolov6s6 = dict( - img_size=1280, - shrink_size=8, - infer_on_rect=False, - ), - yolov6m6 = dict( - img_size=1280, - shrink_size=64, - infer_on_rect=False, - ), - yolov6l6 = dict( - img_size=1280, - shrink_size=41, - infer_on_rect=False, - ), - yolov6s_mbla = dict( - img_size=640, - shrink_size=7, - infer_on_rect=False, - ), - yolov6m_mbla = dict( - img_size=640, - shrink_size=7, - infer_on_rect=False, - ), - yolov6l_mbla = dict( - img_size=640, - shrink_size=7, - infer_on_rect=False, - ), - yolov6x_mbla = dict( - img_size=640, - shrink_size=3, - infer_on_rect=False, - ) -) diff --git a/configs/experiment/yolov6n_with_eval_params.py b/configs/experiment/yolov6n_with_eval_params.py deleted file mode 100644 index e7366b33..00000000 --- a/configs/experiment/yolov6n_with_eval_params.py +++ /dev/null @@ -1,76 +0,0 @@ -# YOLOv6n model with eval param(when traing) -model = dict( - type='YOLOv6n', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, #0.01 # 0.02 - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Eval params when eval model. -# If eval_params item is list, eg conf_thres=[0.03, 0.03], -# first will be used in train.py and second will be used in eval.py. -eval_params = dict( - batch_size=None, #None mean will be the same as batch on one device * 2 - img_size=None, #None mean will be the same as train image size - conf_thres=0.03, - iou_thres=0.65, - - #pading and scale coord - shrink_size=None, # None mean will not shrink the image. 
- infer_on_rect=True, - - #metric - verbose=False, - do_coco_metric=True, - do_pr_metric=False, - plot_curve=False, - plot_confusion_matrix=False -) diff --git a/configs/experiment/yolov6s_csp_scaled.py b/configs/experiment/yolov6s_csp_scaled.py deleted file mode 100644 index ba28843a..00000000 --- a/configs/experiment/yolov6s_csp_scaled.py +++ /dev/null @@ -1,57 +0,0 @@ -# YOLOv6m model -model = dict( - type='YOLOv6s_csp', - pretrained=None, - depth_multiple=0.70, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - neck=dict( - type='CSPRepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) diff --git a/configs/experiment/yolov6t.py b/configs/experiment/yolov6t.py deleted file mode 100644 index afacd436..00000000 --- a/configs/experiment/yolov6t.py +++ /dev/null @@ -1,55 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/experiment/yolov6t_csp_scaled.py b/configs/experiment/yolov6t_csp_scaled.py deleted file mode 100644 index e8ba99a9..00000000 --- a/configs/experiment/yolov6t_csp_scaled.py +++ /dev/null @@ -1,57 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n_csp', - pretrained=None, - depth_multiple=0.60, - width_multiple=0.50, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - neck=dict( - type='CSPRepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - 
lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) diff --git a/configs/experiment/yolov6t_finetune.py b/configs/experiment/yolov6t_finetune.py deleted file mode 100644 index 8be47416..00000000 --- a/configs/experiment/yolov6t_finetune.py +++ /dev/null @@ -1,55 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained='weights/yolov6t.pt', - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/mbla/README.md b/configs/mbla/README.md deleted file mode 100644 index d163124d..00000000 --- a/configs/mbla/README.md +++ /dev/null @@ -1,28 +0,0 @@ -## YOLOv6 mbla model - -English | [简体中文](./README_cn.md) - -### Features - -- Apply MBLABlock(Multi Branch Layer Aggregation Block) blocks in the network structure. - -Advantage: -- Adopt a unified network structure and configuration. - -- Better performance for Small model comparing to yolov6 3.0 release. - -- Better performance comparing to yolov6 3.0 base. - - - -### Performance - -| Model | Size | mAPval
0.5:0.95 | SpeedT4
trt fp16 b1
(fps) | SpeedT4
trt fp16 b32
(fps) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | -------- | :----------------------- | -------------------------------------- | --------------------------------------- | -------------------- | ------------------- | -| [**YOLOv6-S-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6s_mbla.pt) | 640 | 47.0distill | 300 | 424 | 11.6 | 29.8 | -| [**YOLOv6-M-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6m_mbla.pt) | 640 | 50.3distill | 168 | 216 | 26.1 | 66.7 | -| [**YOLOv6-L-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6l_base.pt) | 640 | 52.0distill | 129 | 154 | 46.3 | 118.2 | -| [**YOLOv6-X-base**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6x_base.pt) | 640 | 53.5distill | 78 | 94 | 78.8 | 199.0 | - -- Speed is tested with TensorRT 8.4.2.4 on T4. -- The processes of model training, evaluation, and inference are the same as the original ones. For details, please refer to [this README](https://github.com/meituan/YOLOv6#quick-start). diff --git a/configs/mbla/README_cn.md b/configs/mbla/README_cn.md deleted file mode 100644 index ad399fe0..00000000 --- a/configs/mbla/README_cn.md +++ /dev/null @@ -1,26 +0,0 @@ -## YOLOv6 MBLA版模型 - -简体中文 | [English](./README.md) - -### 模型特点 - -- 网络主体结构均采用MBLABlock(Multi Branch Layer Aggregation Block) - -优势: -- 采用统一的网络结构和配置 - -- 相比3.0版本在s尺度效果提升,相比3.0base版本各尺度效果提升 - - - -### 模型指标 - -| 模型 | 输入尺寸 | mAPval
0.5:0.95 | 速度T4
trt fp16 b1
(fps) | 速度T4
trt fp16 b32
(fps) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | -------- | :----------------------- | -------------------------------------- | --------------------------------------- | -------------------- | ------------------- | -| [**YOLOv6-S-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6s_mbla.pt) | 640 | 47.0distill | 300 | 424 | 11.6 | 29.8 | -| [**YOLOv6-M-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6m_mbla.pt) | 640 | 50.3distill | 168 | 216 | 26.1 | 66.7 | -| [**YOLOv6-L-mbla**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6l_base.pt) | 640 | 52.0distill | 129 | 154 | 46.3 | 118.2 | -| [**YOLOv6-X-base**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6x_base.pt) | 640 | 53.5distill | 78 | 94 | 78.8 | 199.0 | - -- 速度是在 T4 上测试的,TensorRT 版本为 8.4.2.4; -- 模型训练、评估、推理流程与原来保持一致,具体可参考 [首页 README 文档](https://github.com/meituan/YOLOv6/blob/main/README_cn.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B)。 diff --git a/configs/mbla/yolov6l_mbla_finetune.py b/configs/mbla/yolov6l_mbla_finetune.py deleted file mode 100644 index 6ea88967..00000000 --- a/configs/mbla/yolov6l_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6l_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6m_mbla.py b/configs/mbla/yolov6m_mbla.py deleted file mode 100644 index f84fc43d..00000000 --- a/configs/mbla/yolov6m_mbla.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6m_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - 
reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6m_mbla_finetune.py b/configs/mbla/yolov6m_mbla_finetune.py deleted file mode 100644 index aa0bc816..00000000 --- a/configs/mbla/yolov6m_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6m_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6s_mbla.py b/configs/mbla/yolov6s_mbla.py deleted file mode 100644 index eedc76ee..00000000 --- a/configs/mbla/yolov6s_mbla.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6s_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.5, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) - 
-training_mode = "conv_silu" diff --git a/configs/mbla/yolov6s_mbla_finetune.py b/configs/mbla/yolov6s_mbla_finetune.py deleted file mode 100644 index a9812c71..00000000 --- a/configs/mbla/yolov6s_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6s_mbla', - pretrained=None, - depth_multiple=0.5, - width_multiple=0.5, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6x_mbla.py b/configs/mbla/yolov6x_mbla.py deleted file mode 100644 index b7b9703c..00000000 --- a/configs/mbla/yolov6x_mbla.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6x_mbla', - pretrained=None, - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) - -training_mode = "conv_silu" diff --git a/configs/mbla/yolov6x_mbla_finetune.py b/configs/mbla/yolov6x_mbla_finetune.py deleted file mode 100644 index 65c57cb2..00000000 --- a/configs/mbla/yolov6x_mbla_finetune.py +++ /dev/null @@ -1,70 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6x_mbla', - pretrained=None, - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], - out_channels=[64, 128, 256, 512, 1024], - 
csp_e=float(1)/2, - fuse_P2=True, - stage_block_type="MBLABlock", - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - stage_block_type="MBLABlock", - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver=dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) - -training_mode = "conv_silu" diff --git a/configs/qarepvgg/README.md b/configs/qarepvgg/README.md deleted file mode 100644 index 81b130d2..00000000 --- a/configs/qarepvgg/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## YOLOv6 base model - -English | [简体中文](./README_cn.md) - -### Features - -- This is a RepOpt-version implementation of YOLOv6 according to [QARepVGG](https://arxiv.org/abs/2212.01593). - -- The QARep version models possess slightly lower float accuracy on COCO than the RepVGG version models, but achieve highly improved quantized accuracy. - -- The INT8 accuracies listed were obtained using a simple PTQ process, as implemented in the [`onnx_to_trt.py`](../../deploy/TensorRT/onnx_to_trt.py) script. However, higher accuracies could be achieved using Quantization-Aware Training (QAT) due to the specific architecture design of the QARepVGG model. - -### Performance - -| Model | Size | Float
mAPval
0.5:0.95 | INT8
mAPval
0.5:0.95 | SpeedT4
trt fp16 b32
(fps) | SpeedT4
trt int8 b32
(fps) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | -------- | :----------------------- | -------------------------------------- | --------------------------------------- | -------------------- | ------------------- | -------------------- | -| [**YOLOv6-N**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n.pt) | 640 | 37.5 | 34.3 | 1286 | 1773 |4.7 | 11.4 | -| [**YOLOv6-N-qa**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6n_qa.pt) | 640 | 37.1 | 36.4 | 1286 | 1773 | 4.7 | 11.4 | -| [**YOLOv6-S**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s.pt) | 640 | 45.0 | 41.3 | 513 | 1117 | 18.5 | 45.3 | -| [**YOLOv6-S-qa**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6s_qa.pt) | 640 | 44.7 | 44.0 | 513 | 1117 | 18.5 | 45.3 | -| [**YOLOv6-M**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m.pt) | 640 | 50.0 | 48.1 | 250 | 439 | 34.9 | 85.8 | -| [**YOLOv6-M-qa**](https://github.com/meituan/YOLOv6/releases/download/0.3.0/yolov6m_qa.pt) | 640 | 49.7 | 49.4 | 250 | 439 | 34.9 | 85.8 | - -- Speed is tested with TensorRT 8.4 on T4. -- We have not conducted experiments on the YOLOv6-L model since it does not use the RepVGG architecture. -- The processes of model training, evaluation, and inference are the same as the original ones. For details, please refer to [this README](https://github.com/meituan/YOLOv6#quick-start). diff --git a/configs/repopt/yolov6_tiny_hs.py b/configs/repopt/yolov6_tiny_hs.py deleted file mode 100644 index 70a74279..00000000 --- a/configs/repopt/yolov6_tiny_hs.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='hyper_search' diff --git a/configs/repopt/yolov6_tiny_opt.py b/configs/repopt/yolov6_tiny_opt.py deleted file mode 100644 index 95dbf317..00000000 --- a/configs/repopt/yolov6_tiny_opt.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained=None, - scales='../yolov6_assert/v6t_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - 
atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6_tiny_opt_qat.py b/configs/repopt/yolov6_tiny_opt_qat.py deleted file mode 100644 index 701bf4f1..00000000 --- a/configs/repopt/yolov6_tiny_opt_qat.py +++ /dev/null @@ -1,83 +0,0 @@ -# YOLOv6t model -model = dict( - type='YOLOv6t', - pretrained='./assets/v6s_t.pt', - scales='./assets/v6t_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.375, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.00001, - lrf=0.001, - momentum=0.937, - weight_decay=0.00005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -ptq = dict( - num_bits = 8, - calib_batches = 4, - # 'max', 'histogram' - calib_method = 'max', - # 'entropy', 'percentile', 'mse' - histogram_amax_method='entropy', - histogram_amax_percentile=99.99, - calib_output_path='./', - sensitive_layers_skip=False, - sensitive_layers_list=[], -) - -qat = dict( - calib_pt = './assets/v6s_t_calib_max.pt', - sensitive_layers_skip = False, - sensitive_layers_list=[], -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6n_hs.py b/configs/repopt/yolov6n_hs.py deleted file mode 100644 index 67607ba2..00000000 --- a/configs/repopt/yolov6n_hs.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, #0.01 # 0.02 - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - 
hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='hyper_search' diff --git a/configs/repopt/yolov6n_opt.py b/configs/repopt/yolov6n_opt.py deleted file mode 100644 index 9b3db4fb..00000000 --- a/configs/repopt/yolov6n_opt.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n', - pretrained=None, - scales='../yolov6_assert/v6n_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, #0.01 # 0.02 - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6n_opt_qat.py b/configs/repopt/yolov6n_opt_qat.py deleted file mode 100644 index 4e76dfd3..00000000 --- a/configs/repopt/yolov6n_opt_qat.py +++ /dev/null @@ -1,82 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n', - pretrained='./assets/v6s_n.pt', - scales='./assets/v6n_v2_scale_last.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, - reg_max=0, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.00001, #0.01 # 0.02 - lrf=0.001, - momentum=0.937, - weight_decay=0.00005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -ptq = dict( - num_bits = 8, - calib_batches = 4, - # 'max', 'histogram' - calib_method = 'max', - # 'entropy', 'percentile', 'mse' - histogram_amax_method='entropy', - histogram_amax_percentile=99.99, - calib_output_path='./', - sensitive_layers_skip=False, - sensitive_layers_list=[], -) - -qat = dict( - calib_pt = './assets/v6s_n_calib_max.pt', - sensitive_layers_skip = False, - sensitive_layers_list=[], -) -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6s_hs.py 
b/configs/repopt/yolov6s_hs.py deleted file mode 100644 index 60c7286a..00000000 --- a/configs/repopt/yolov6s_hs.py +++ /dev/null @@ -1,59 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, - reg_max=0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='hyper_search' diff --git a/configs/repopt/yolov6s_opt.py b/configs/repopt/yolov6s_opt.py deleted file mode 100644 index 2676eb4f..00000000 --- a/configs/repopt/yolov6s_opt.py +++ /dev/null @@ -1,60 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained=None, - scales='../yolov6_assert/v6s_v2_scale.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, - reg_max=0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/repopt/yolov6s_opt_qat.py b/configs/repopt/yolov6s_opt_qat.py deleted file mode 100644 index a41ea085..00000000 --- a/configs/repopt/yolov6s_opt_qat.py +++ /dev/null @@ -1,113 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained='./assets/yolov6s_v2_reopt_43.1.pt', - scales='./assets/yolov6s_v2_scale.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - ), - neck=dict( - type='RepPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=1, - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type = 'giou', - use_dfl = False, - reg_max = 0, # if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - 
optim='SGD', - lr_scheduler='Cosine', - lr0=0.00001, - lrf=0.001, - momentum=0.937, - weight_decay=0.00005, - warmup_epochs=3, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) - -ptq = dict( - num_bits = 8, - calib_batches = 4, - # 'max', 'histogram' - calib_method = 'histogram', - # 'entropy', 'percentile', 'mse' - histogram_amax_method='entropy', - histogram_amax_percentile=99.99, - calib_output_path='./', - sensitive_layers_skip=False, - sensitive_layers_list=['detect.stems.0.conv', - 'detect.stems.1.conv', - 'detect.stems.2.conv', - 'detect.cls_convs.0.conv', - 'detect.cls_convs.1.conv', - 'detect.cls_convs.2.conv', - 'detect.reg_convs.0.conv', - 'detect.reg_convs.1.conv', - 'detect.reg_convs.2.conv', - 'detect.cls_preds.0', - 'detect.cls_preds.1', - 'detect.cls_preds.2', - 'detect.reg_preds.0', - 'detect.reg_preds.1', - 'detect.reg_preds.2', - ], -) - -qat = dict( - calib_pt = './assets/yolov6s_v2_reopt_43.1_calib_histogram.pt', - sensitive_layers_skip = False, - sensitive_layers_list=['detect.stems.0.conv', - 'detect.stems.1.conv', - 'detect.stems.2.conv', - 'detect.cls_convs.0.conv', - 'detect.cls_convs.1.conv', - 'detect.cls_convs.2.conv', - 'detect.reg_convs.0.conv', - 'detect.reg_convs.1.conv', - 'detect.reg_convs.2.conv', - 'detect.cls_preds.0', - 'detect.cls_preds.1', - 'detect.cls_preds.2', - 'detect.reg_preds.0', - 'detect.reg_preds.1', - 'detect.reg_preds.2', - ], -) - -# Choose Rep-block by the training Mode, choices=["repvgg", "hyper-search", "repopt"] -training_mode='repopt' diff --git a/configs/yolov6l.py b/configs/solo/yolov6l_solo.py similarity index 92% rename from configs/yolov6l.py rename to configs/solo/yolov6l_solo.py index bfa6728b..caabc1f4 100644 --- a/configs/yolov6l.py +++ b/configs/solo/yolov6l_solo.py @@ -1,4 +1,4 @@ -# YOLOv6l model +# YOLOv6l-seg model model = dict( type='YOLOv6l', pretrained=None, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/qarepvgg/yolov6m_qa.py b/configs/solo/yolov6m_solo.py similarity index 92% rename from configs/qarepvgg/yolov6m_qa.py rename to configs/solo/yolov6m_solo.py index c0690f15..84e73c0f 100644 --- a/configs/qarepvgg/yolov6m_qa.py +++ b/configs/solo/yolov6m_solo.py @@ -1,4 +1,4 @@ -# YOLOv6m model +# YOLOv6m-seg model model = dict( type='YOLOv6m', pretrained=None, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -64,5 +68,3 @@ mosaic=1.0, mixup=0.1, ) - -training_mode='qarepvggv2' diff --git a/configs/yolov6n.py b/configs/solo/yolov6n_solo.py similarity index 92% rename from configs/yolov6n.py rename to configs/solo/yolov6n_solo.py index 74f9386d..6392ceb4 100644 --- a/configs/yolov6n.py +++ b/configs/solo/yolov6n_solo.py @@ -1,4 +1,4 @@ -# YOLOv6n model +# YOLOv6n-seg model model = dict( type='YOLOv6n', 
pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -44,7 +48,7 @@ lr0=0.02, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/qarepvgg/yolov6s_qa.py b/configs/solo/yolov6s_solo.py similarity index 94% rename from configs/qarepvgg/yolov6s_qa.py rename to configs/solo/yolov6s_solo.py index 3051679a..c2499ba3 100644 --- a/configs/qarepvgg/yolov6s_qa.py +++ b/configs/solo/yolov6s_solo.py @@ -1,4 +1,4 @@ -# YOLOv6s model +# YOLOv6s-seg model model = dict( type='YOLOv6s', pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -63,5 +67,3 @@ mosaic=1.0, mixup=0.0, ) - -training_mode='qarepvggv2' diff --git a/configs/base/yolov6m_base.py b/configs/solo/yolov6x_solo.py similarity index 81% rename from configs/base/yolov6m_base.py rename to configs/solo/yolov6x_solo.py index 5670f096..57a175ab 100644 --- a/configs/base/yolov6m_base.py +++ b/configs/solo/yolov6x_solo.py @@ -1,9 +1,9 @@ -# YOLOv6m medium/large base model +# YOLOv6x-seg model model = dict( - type='YOLOv6m_base', + type='YOLOv6x', pretrained=None, - depth_multiple=0.80, - width_multiple=0.75, + depth_multiple=1.33, + width_multiple=1.25, backbone=dict( type='CSPBepBackbone', num_repeats=[1, 6, 12, 18, 6], @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=64, + isseg=True, + issolo=True, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -33,7 +37,7 @@ use_dfl=True, reg_max=16, #if use_dfl is False, please set reg_max to 0 distill_weight={ - 'class': 0.8, + 'class': 2.0, 'dfl': 1.0, }, ) @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.0015, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -64,4 +68,5 @@ mosaic=1.0, mixup=0.1, ) -training_mode = "conv_relu" +training_mode = "conv_silu" +# use normal conv to speed up training and further improve accuracy. diff --git a/configs/yolov6_lite/README.md b/configs/yolov6_lite/README.md deleted file mode 100644 index 170d12d9..00000000 --- a/configs/yolov6_lite/README.md +++ /dev/null @@ -1,22 +0,0 @@ -## YOLOv6Lite model - -English | [简体中文](./README_cn.md) - -## Mobile Benchmark -| Model | Size | mAPval
0.5:0.95 | sm8350
(ms) | mt6853
(ms) | sdm660
(ms) | Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | ---- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -| [**YOLOv6Lite-S**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_s.pt) | 320*320 | 22.4 | 7.99 | 11.99 | 41.86 | 0.55 | 0.56 | -| [**YOLOv6Lite-M**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_m.pt) | 320*320 | 25.1 | 9.08 | 13.27 | 47.95 | 0.79 | 0.67 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*320 | 28.0 | 11.37 | 16.20 | 61.40 | 1.09 | 0.87 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*192 | 25.0 | 7.02 | 9.66 | 36.13 | 1.09 | 0.52 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 224*128 | 18.9 | 3.63 | 4.99 | 17.76 | 1.09 | 0.24 | - -
-Table Notes - -- From the perspective of model size and input image ratio, we have built a series of models on the mobile terminal to facilitate flexible applications in different scenarios. -- All checkpoints are trained with 400 epochs without distillation. -- Results of the mAP and speed are evaluated on [COCO val2017](https://cocodataset.org/#download) dataset, and the input resolution is the Size in the table. -- Speed is tested on MNN 2.3.0 AArch64 with 2 threads by arm82 acceleration. The inference warm-up is performed 10 times, and the cycle is performed 100 times. -- Qualcomm 888(sm8350), Dimensity 720(mt6853) and Qualcomm 660(sdm660) correspond to chips with different performances at the high, middle and low end respectively, which can be used as a reference for model capabilities under different chips. -- Refer to [Test NCNN Speed](./docs/Test_NCNN_speed.md) tutorial to reproduce the NCNN speed results of YOLOv6Lite. diff --git a/configs/yolov6_lite/README_cn.md b/configs/yolov6_lite/README_cn.md deleted file mode 100644 index 23dd715e..00000000 --- a/configs/yolov6_lite/README_cn.md +++ /dev/null @@ -1,23 +0,0 @@ -## YOLOv6 轻量级模型 - -简体中文 | [English](./README.md) - -## 移动端模型指标 - -| 模型 | 输入尺寸 | mAPval
0.5:0.95 | sm8350
(ms) | mt6853
(ms) | sdm660
(ms) |Params
(M) | FLOPs
(G) | -| :----------------------------------------------------------- | ---- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -------------------- | -| [**YOLOv6Lite-S**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_s.pt) | 320*320 | 22.4 | 7.99 | 11.99 | 41.86 | 0.55 | 0.56 | -| [**YOLOv6Lite-M**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_m.pt) | 320*320 | 25.1 | 9.08 | 13.27 | 47.95 | 0.79 | 0.67 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*320 | 28.0 | 11.37 | 16.20 | 61.40 | 1.09 | 0.87 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 320*192 | 25.0 | 7.02 | 9.66 | 36.13 | 1.09 | 0.52 | -| [**YOLOv6Lite-L**](https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6lite_l.pt) | 224*128 | 18.9 | 3.63 | 4.99 | 17.76 | 1.09 | 0.24 | - -
-表格笔记 - -- 从模型尺寸和输入图片比例两种角度,在构建了移动端系列模型,方便不同场景下的灵活应用。 -- 所有权重都经过 400 个 epoch 的训练,并且没有使用蒸馏技术。 -- mAP 和速度指标是在 COCO val2017 数据集上评估的,输入分辨率为表格中对应展示的。 -- 使用 MNN 2.3.0 AArch64 进行速度测试。测速时,采用2个线程,并开启arm82加速,推理预热10次,循环100次。 -- 高通888(sm8350)、天玑720(mt6853)和高通660(sdm660)分别对应高中低端不同性能的芯片,可以作为不同芯片下机型能力的参考。 -- [NCNN 速度测试](./docs/Test_NCNN_speed.md)教程可以帮助展示及复现 YOLOv6Lite 的 NCNN 速度结果。 diff --git a/configs/yolov6_lite/yolov6_lite_l.py b/configs/yolov6_lite/yolov6_lite_l.py deleted file mode 100644 index 212c8c73..00000000 --- a/configs/yolov6_lite/yolov6_lite_l.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-l model -model = dict( - type='YOLOv6-lite-l', - pretrained=None, - width_multiple=1.5, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.1 * 4, - lrf=0.01, - momentum=0.9, - weight_decay=0.00004, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6_lite/yolov6_lite_l_finetune.py b/configs/yolov6_lite/yolov6_lite_l_finetune.py deleted file mode 100644 index 48315c4d..00000000 --- a/configs/yolov6_lite/yolov6_lite_l_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-l model -model = dict( - type='YOLOv6-lite-l', - pretrained='weights/yolov6lite_l.pt', - width_multiple=1.5, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6_lite/yolov6_lite_m.py b/configs/yolov6_lite/yolov6_lite_m.py deleted file mode 100644 index 8f0de368..00000000 --- a/configs/yolov6_lite/yolov6_lite_m.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-m model -model = dict( - type='YOLOv6-lite-m', - pretrained=None, - width_multiple=1.1, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max 
to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.1 * 4, - lrf=0.01, - momentum=0.9, - weight_decay=0.00004, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6_lite/yolov6_lite_m_finetune.py b/configs/yolov6_lite/yolov6_lite_m_finetune.py deleted file mode 100644 index 108adda5..00000000 --- a/configs/yolov6_lite/yolov6_lite_m_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-m model -model = dict( - type='YOLOv6-lite-m', - pretrained='weights/yolov6lite_m.pt', - width_multiple=1.1, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6_lite/yolov6_lite_s.py b/configs/yolov6_lite/yolov6_lite_s.py deleted file mode 100644 index 42a52e37..00000000 --- a/configs/yolov6_lite/yolov6_lite_s.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-s model -model = dict( - type='YOLOv6-lite-s', - pretrained=None, - width_multiple=0.7, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.1 * 4, - lrf=0.01, - momentum=0.9, - weight_decay=0.00004, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6_lite/yolov6_lite_s_finetune.py b/configs/yolov6_lite/yolov6_lite_s_finetune.py deleted file mode 100644 index befee2ce..00000000 --- a/configs/yolov6_lite/yolov6_lite_s_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -# YOLOv6-lite-s model -model = dict( - type='YOLOv6-lite-s', - pretrained='weights/yolov6lite_s.pt', - width_multiple=0.7, - backbone=dict( - type='Lite_EffiBackbone', - num_repeats=[1, 3, 7, 3], - out_channels=[24, 32, 64, 128, 256], - scale_size=0.5, - ), - neck=dict( - type='Lite_EffiNeck', - in_channels=[256, 128, 64], - unified_channels=96 - ), - head=dict( - type='Lite_EffideHead', - in_channels=[96, 96, 96, 96], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - 
use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6l6.py b/configs/yolov6l6.py deleted file mode 100644 index 3bb77c5f..00000000 --- a/configs/yolov6l6.py +++ /dev/null @@ -1,62 +0,0 @@ -# YOLOv6l6 model -model = dict( - type='YOLOv6l6', - pretrained=None, - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.2, -) -training_mode = "conv_silu" diff --git a/configs/yolov6l6_finetune.py b/configs/yolov6l6_finetune.py deleted file mode 100644 index 2ffb8ada..00000000 --- a/configs/yolov6l6_finetune.py +++ /dev/null @@ -1,62 +0,0 @@ -# YOLOv6l6 model -model = dict( - type='YOLOv6l6', - pretrained='weights/yolov6l6.pt', - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_silu" diff --git a/configs/yolov6l_finetune.py b/configs/yolov6l_finetune.py deleted file mode 100644 index 9b301233..00000000 --- a/configs/yolov6l_finetune.py +++ /dev/null @@ -1,68 +0,0 @@ -# YOLOv6l model -model = dict( - type='YOLOv6l', - pretrained='weights/yolov6l.pt', - depth_multiple=1.0, - width_multiple=1.0, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - 
out_channels=[64, 128, 256, 512, 1024], - csp_e=float(1)/2, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(1)/2, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 2.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) -training_mode = "conv_silu" -# use normal conv to speed up training and further improve accuracy. diff --git a/configs/base/yolov6l_base.py b/configs/yolov6l_seg.py similarity index 85% rename from configs/base/yolov6l_base.py rename to configs/yolov6l_seg.py index ef2dbbb2..2ed9211f 100644 --- a/configs/base/yolov6l_base.py +++ b/configs/yolov6l_seg.py @@ -1,6 +1,6 @@ -# YOLOv6l large base model +# YOLOv6l-seg model model = dict( - type='YOLOv6l_base', + type='YOLOv6l', pretrained=None, depth_multiple=1.0, width_multiple=1.0, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -64,4 +68,5 @@ mosaic=1.0, mixup=0.1, ) -training_mode = "conv_relu" +training_mode = "conv_silu" +# use normal conv to speed up training and further improve accuracy. 
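Note: the segmentation configs in this patch (the renamed `*_seg.py` files and the `configs/solo/*.py` variants) differ from the detection configs mainly in four new head fields, `npr`, `nm`, `isseg` and `issolo`, plus a larger `weight_decay`. The sketch below is only a minimal illustration of how those fields combine in the head dict; the interpretations of `npr` (prototype feature channels) and `nm` (mask coefficients per prediction) are assumptions based on common YOLO-style segmentation heads, not definitions taken from this patch.

```python
# Minimal sketch of the segmentation-specific head fields introduced by this patch.
# The comments on npr/nm describe assumed meanings (prototype channels / number of
# mask coefficients), inferred from typical YOLO-style segmentation heads.

def make_seg_head(nm=32, issolo=False):
    """Build a head dict following the pattern used by the *_seg.py configs."""
    return dict(
        type='EffiDeHead',
        in_channels=[128, 256, 512],
        num_layers=3,
        begin_indices=24,
        npr=256,        # prototype feature channels (assumed meaning)
        nm=nm,          # mask coefficients: 32 in the *_seg.py configs, 64 in the solo configs
        isseg=True,     # enable the segmentation branch
        issolo=issolo,  # True only for the configs/solo/*.py variants
        anchors=3,
        strides=[8, 16, 32],
        use_dfl=True,
        reg_max=16,
    )


if __name__ == '__main__':
    print(make_seg_head())                     # e.g. configs/yolov6s_seg.py style
    print(make_seg_head(nm=64, issolo=True))   # e.g. configs/solo/yolov6s_solo.py style
```

This only illustrates the shape of the config entry; how the head implementation consumes these fields is defined elsewhere in the YOLOv6 codebase.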
diff --git a/configs/yolov6m6.py b/configs/yolov6m6.py deleted file mode 100644 index e741bbc0..00000000 --- a/configs/yolov6m6.py +++ /dev/null @@ -1,61 +0,0 @@ -# YOLOv6m6 model -model = dict( - type='YOLOv6m6', - pretrained=None, - depth_multiple=0.60, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(2)/3, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(2)/3, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.9, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.1, -) diff --git a/configs/yolov6m6_finetune.py b/configs/yolov6m6_finetune.py deleted file mode 100644 index 83760d3a..00000000 --- a/configs/yolov6m6_finetune.py +++ /dev/null @@ -1,61 +0,0 @@ -# YOLOv6m6 model -model = dict( - type='YOLOv6m6', - pretrained='weights/yolov6m6.pt', - depth_multiple=0.60, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone_P6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - csp_e=float(2)/3, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck_P6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - csp_e=float(2)/3, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6m_finetune.py b/configs/yolov6m_finetune.py deleted file mode 100644 index cfe0fa93..00000000 --- a/configs/yolov6m_finetune.py +++ /dev/null @@ -1,66 +0,0 @@ -# YOLOv6m model -model = dict( - type='YOLOv6m', - pretrained='weights/yolov6m.pt', - depth_multiple=0.60, - width_multiple=0.75, - backbone=dict( - type='CSPBepBackbone', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - csp_e=float(2)/3, - fuse_P2=True, - ), - neck=dict( - type='CSPRepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - csp_e=float(2)/3, - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - 
use_dfl=True, - reg_max=16, #if use_dfl is False, please set reg_max to 0 - distill_weight={ - 'class': 0.8, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6m.py b/configs/yolov6m_seg.py similarity index 92% rename from configs/yolov6m.py rename to configs/yolov6m_seg.py index 29fae396..d8660be3 100644 --- a/configs/yolov6m.py +++ b/configs/yolov6m_seg.py @@ -1,4 +1,4 @@ -# YOLOv6m model +# YOLOv6m-seg model model = dict( type='YOLOv6m', pretrained=None, @@ -22,6 +22,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -45,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/yolov6n6.py b/configs/yolov6n6.py deleted file mode 100644 index 0abe3a44..00000000 --- a/configs/yolov6n6.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n6', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to True. - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.02, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6n6_finetune.py b/configs/yolov6n6_finetune.py deleted file mode 100644 index 01100f0f..00000000 --- a/configs/yolov6n6_finetune.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6n6', - pretrained='weights/yolov6n6.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to True. 
- cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='siou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6n_finetune.py b/configs/yolov6n_finetune.py deleted file mode 100644 index 03b6d1ba..00000000 --- a/configs/yolov6n_finetune.py +++ /dev/null @@ -1,65 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6n', - pretrained='weights/yolov6n.pt', - depth_multiple=0.33, - width_multiple=0.25, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='siou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/qarepvgg/yolov6n_qa.py b/configs/yolov6n_seg.py similarity index 92% rename from configs/qarepvgg/yolov6n_qa.py rename to configs/yolov6n_seg.py index b42d9ddb..94b42ed1 100644 --- a/configs/qarepvgg/yolov6n_qa.py +++ b/configs/yolov6n_seg.py @@ -1,4 +1,4 @@ -# YOLOv6s model +# YOLOv6n-seg model model = dict( type='YOLOv6n', pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -44,7 +48,7 @@ lr0=0.02, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -63,4 +67,3 @@ mosaic=1.0, mixup=0.0, ) -training_mode='qarepvggv2' diff --git a/configs/yolov6s6.py b/configs/yolov6s6.py deleted file mode 100644 index 091bfffc..00000000 --- a/configs/yolov6s6.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6s6', - pretrained=None, - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to 
True. - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.01, - lrf=0.01, - momentum=0.937, - weight_decay=0.0005, - warmup_epochs=3.0, - warmup_momentum=0.8, - warmup_bias_lr=0.1 -) - -data_aug = dict( - hsv_h=0.015, - hsv_s=0.7, - hsv_v=0.4, - degrees=0.0, - translate=0.1, - scale=0.5, - shear=0.0, - flipud=0.0, - fliplr=0.5, - mosaic=1.0, - mixup=0.0, -) diff --git a/configs/yolov6s6_finetune.py b/configs/yolov6s6_finetune.py deleted file mode 100644 index a22697ed..00000000 --- a/configs/yolov6s6_finetune.py +++ /dev/null @@ -1,56 +0,0 @@ -# YOLOv6n model -model = dict( - type='YOLOv6s6', - pretrained='weights/yolov6s6.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep6', - num_repeats=[1, 6, 12, 18, 6, 6], - out_channels=[64, 128, 256, 512, 768, 1024], - fuse_P2=True, # if use RepBiFPANNeck6, please set fuse_P2 to True. - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck6', - num_repeats=[12, 12, 12, 12, 12, 12], - out_channels=[512, 256, 128, 256, 512, 1024], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512, 1024], - num_layers=4, - anchors=1, - strides=[8, 16, 32, 64], - atss_warmup_epoch=4, - iou_type='giou', - use_dfl=False, - reg_max=0 #if use_dfl is False, please set reg_max to 0 - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6s_finetune.py b/configs/yolov6s_finetune.py deleted file mode 100644 index d6fb27fe..00000000 --- a/configs/yolov6s_finetune.py +++ /dev/null @@ -1,65 +0,0 @@ -# YOLOv6s model -model = dict( - type='YOLOv6s', - pretrained='weights/yolov6s.pt', - depth_multiple=0.33, - width_multiple=0.50, - backbone=dict( - type='EfficientRep', - num_repeats=[1, 6, 12, 18, 6], - out_channels=[64, 128, 256, 512, 1024], - fuse_P2=True, - cspsppf=True, - ), - neck=dict( - type='RepBiFPANNeck', - num_repeats=[12, 12, 12, 12], - out_channels=[256, 128, 128, 256, 256, 512], - ), - head=dict( - type='EffiDeHead', - in_channels=[128, 256, 512], - num_layers=3, - begin_indices=24, - anchors=3, - anchors_init=[[10,13, 19,19, 33,23], - [30,61, 59,59, 59,119], - [116,90, 185,185, 373,326]], - out_indices=[17, 20, 23], - strides=[8, 16, 32], - atss_warmup_epoch=0, - iou_type='giou', - use_dfl=False, # set to True if you want to further train with distillation - reg_max=0, # set to 16 if you want to further train with distillation - distill_weight={ - 'class': 1.0, - 'dfl': 1.0, - }, - ) -) - -solver = dict( - optim='SGD', - lr_scheduler='Cosine', - lr0=0.0032, - lrf=0.12, - momentum=0.843, - weight_decay=0.00036, - warmup_epochs=2.0, - warmup_momentum=0.5, - warmup_bias_lr=0.05 -) - -data_aug = dict( - hsv_h=0.0138, - hsv_s=0.664, - hsv_v=0.464, - degrees=0.373, - translate=0.245, - scale=0.898, - shear=0.602, - flipud=0.00856, - fliplr=0.5, - 
mosaic=1.0, - mixup=0.243, -) diff --git a/configs/yolov6s.py b/configs/yolov6s_seg.py similarity index 92% rename from configs/yolov6s.py rename to configs/yolov6s_seg.py index 8d8b6739..c4274ccc 100644 --- a/configs/yolov6s.py +++ b/configs/yolov6s_seg.py @@ -1,4 +1,4 @@ -# YOLOv6s model +# YOLOv6s-seg model model = dict( type='YOLOv6s', pretrained=None, @@ -21,6 +21,10 @@ in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -44,7 +48,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 diff --git a/configs/mbla/yolov6l_mbla.py b/configs/yolov6x_seg.py similarity index 79% rename from configs/mbla/yolov6l_mbla.py rename to configs/yolov6x_seg.py index 7534b705..3ef53e50 100644 --- a/configs/mbla/yolov6l_mbla.py +++ b/configs/yolov6x_seg.py @@ -1,29 +1,31 @@ -# YOLOv6l model +# YOLOv6l-seg model model = dict( - type='YOLOv6l_mbla', + type='YOLOv6l', pretrained=None, - depth_multiple=0.5, - width_multiple=1.0, + depth_multiple=1.33, + width_multiple=1.25, backbone=dict( type='CSPBepBackbone', - num_repeats=[1, 4, 8, 8, 4], + num_repeats=[1, 6, 12, 18, 6], out_channels=[64, 128, 256, 512, 1024], csp_e=float(1)/2, fuse_P2=True, - stage_block_type="MBLABlock", ), neck=dict( type='CSPRepBiFPANNeck', - num_repeats=[8, 8, 8, 8], + num_repeats=[12, 12, 12, 12], out_channels=[256, 128, 128, 256, 256, 512], csp_e=float(1)/2, - stage_block_type="MBLABlock", ), head=dict( type='EffiDeHead', in_channels=[128, 256, 512], num_layers=3, begin_indices=24, + npr=256, + nm=32, + isseg=True, + issolo=False, anchors=3, anchors_init=[[10,13, 19,19, 33,23], [30,61, 59,59, 59,119], @@ -47,7 +49,7 @@ lr0=0.01, lrf=0.01, momentum=0.937, - weight_decay=0.0005, + weight_decay=0.001, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1 @@ -66,5 +68,5 @@ mosaic=1.0, mixup=0.1, ) - training_mode = "conv_silu" +# use normal conv to speed up training and further improve accuracy. diff --git a/data/coco.yaml b/data/coco.yaml index d20d411e..8ce2676d 100644 --- a/data/coco.yaml +++ b/data/coco.yaml @@ -1,13 +1,11 @@ # COCO 2017 dataset http://cocodataset.org -train: ../coco/images/train2017 # 118287 images -val: ../coco/images/val2017 # 5000 images -test: ../coco/images/test2017 -anno_path: ../coco/annotations/instances_val2017.json +train: ./data/coco/images/train2017 # 118287 images +val: ./data/coco/images/val2017 # 5000 images +# test: ./data/coco/images/val2017 # number of classes nc: 80 -# whether it is coco dataset, only coco dataset should be set to True. 
-is_coco: True + # class names names: [ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', diff --git a/data/images/000000056350.jpg b/data/images/000000056350.jpg new file mode 100644 index 00000000..0c95d084 Binary files /dev/null and b/data/images/000000056350.jpg differ diff --git a/data/images/9_Press_Conference_Press_Conference_9_946.jpg b/data/images/9_Press_Conference_Press_Conference_9_946.jpg new file mode 100644 index 00000000..aa342667 Binary files /dev/null and b/data/images/9_Press_Conference_Press_Conference_9_946.jpg differ diff --git a/deploy/ONNX/README.md b/deploy/ONNX/README.md index d42f3c8c..c3a618cb 100644 --- a/deploy/ONNX/README.md +++ b/deploy/ONNX/README.md @@ -33,15 +33,6 @@ python ./deploy/ONNX/export_onnx.py \ - `--conf-thres` : Confidence threshold for NMS algorithm. - `--device` : Export device. Cuda device : 0 or 0,1,2,3 ... , CPU : cpu . -## Download - -* [YOLOv6-N](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6n.onnx) -* [YOLOv6-T](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6t.onnx) -* [YOLOv6-S](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6s.onnx) -* [YOLOv6-M](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6m.onnx) -* [YOLOv6-L-ReLU](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6l_relu.onnx) -* [YOLOv6-L](https://github.com/meituan/YOLOv6/releases/download/0.2.0/yolov6l.onnx) - ## End2End export diff --git a/deploy/ONNX/export_onnx.py b/deploy/ONNX/export_onnx.py index ba7440ae..85368c85 100644 --- a/deploy/ONNX/export_onnx.py +++ b/deploy/ONNX/export_onnx.py @@ -22,7 +22,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--weights', type=str, default='./yolov6s.pt', help='weights path') + parser.add_argument('--weights', type=str, default='./weights/best_ckpt.pt', help='weights path') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='image size, the order is: height width') # height, width parser.add_argument('--batch-size', type=int, default=1, help='batch size') parser.add_argument('--half', action='store_true', help='FP16 half-precision export') diff --git a/tools/eval.py b/tools/eval.py index 5543029c..7814639e 100644 --- a/tools/eval.py +++ b/tools/eval.py @@ -23,13 +23,13 @@ def boolean_string(s): def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description='YOLOv6 PyTorch Evalating', add_help=add_help) parser.add_argument('--data', type=str, default='./data/coco.yaml', help='dataset.yaml path') - parser.add_argument('--weights', type=str, default='./weights/yolov6s.pt', help='model.pt path(s)') - parser.add_argument('--batch-size', type=int, default=32, help='batch size') + parser.add_argument('--weights', type=str, default='./checkpoints/yolov6n_yol.pt', help='model.pt path(s)') + parser.add_argument('--batch-size', type=int, default=2, help='batch size') parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)') parser.add_argument('--conf-thres', type=float, default=0.03, help='confidence threshold') parser.add_argument('--iou-thres', type=float, default=0.65, help='NMS IoU threshold') parser.add_argument('--task', default='val', help='val, test, or speed') - parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--device', default='4', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') parser.add_argument('--half', default=False, action='store_true', help='whether to use fp16 infer') parser.add_argument('--save_dir', type=str, default='runs/val/', help='evaluation save dir') parser.add_argument('--name', type=str, default='exp', help='save evaluation results to save_dir/name') @@ -37,8 +37,8 @@ def get_args_parser(add_help=True): parser.add_argument('--infer_on_rect', default=True, type=boolean_string, help='default to run with rectangle image to boost speed.') parser.add_argument('--reproduce_640_eval', default=False, action='store_true', help='whether to reproduce 640 infer result, overwrite some config') parser.add_argument('--eval_config_file', type=str, default='./configs/experiment/eval_640_repro.py', help='config file for repro 640 infer result') - parser.add_argument('--do_coco_metric', default=True, type=boolean_string, help='whether to use pycocotool to metric, set False to close') - parser.add_argument('--do_pr_metric', default=False, type=boolean_string, help='whether to calculate precision, recall and F1, n, set False to close') + parser.add_argument('--do_coco_metric', default=False, type=boolean_string, help='whether to use pycocotool to metric, set False to close') + parser.add_argument('--do_pr_metric', default=True, type=boolean_string, help='whether to calculate precision, recall and F1, n, set False to close') parser.add_argument('--plot_curve', default=True, type=boolean_string, help='whether to save plots in savedir when do pr metric, set False to close') parser.add_argument('--plot_confusion_matrix', default=False, action='store_true', help='whether to save confusion matrix plots when do pr metric, might cause no harm warning print') parser.add_argument('--verbose', default=False, action='store_true', help='whether to print metric on each class') @@ -46,6 +46,7 @@ def get_args_parser(add_help=True): parser.add_argument('--specific-shape', action='store_true', help='rectangular training') parser.add_argument('--height', type=int, default=None, help='image height of model input') parser.add_argument('--width', type=int, default=None, help='image width of model input') + parser.add_argument('--issolo', default=False, type=boolean_string, help='is solo format') args = parser.parse_args() if args.config_file: @@ -113,7 +114,8 @@ def run(data, config_file=None, specific_shape=False, height=640, - width=640 + width=640, + issolo=False ): """ Run the evaluation process @@ -155,9 +157,11 @@ def run(data, # eval model.eval() - pred_result, vis_outputs, vis_paths = val.predict_model(model, dataloader, task) - eval_result = val.eval_model(pred_result, model, dataloader, task) - return eval_result, vis_outputs, vis_paths + pred_result, _, __= val.predict_model(model, dataloader, task, issolo=issolo) + return pred_result, _, __ + #raise ValueError("..") + #eval_result = val.eval_model(pred_result, model, dataloader, task) + #return eval_result, vis_outputs, vis_paths def main(args): diff --git a/tools/infer.py b/tools/infer.py index 95b3fdc7..cb051112 100644 --- a/tools/infer.py +++ b/tools/infer.py @@ -17,11 +17,11 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description='YOLOv6 PyTorch Inference.', add_help=add_help) - parser.add_argument('--weights', type=str, default='weights/yolov6s.pt', help='model path(s) for inference.') - parser.add_argument('--source', type=str, default='data/images', help='the source path, e.g. 
image-file/dir.') + parser.add_argument('--weights', type=str, default='./checkpoints/yolov6n_yol.pt', help='model path(s) for inference.') + parser.add_argument('--source', type=str, default='./data/images', help='the source path, e.g. image-file/dir.') parser.add_argument('--webcam', action='store_true', help='whether to use webcam.') - parser.add_argument('--webcam-addr', type=str, default='0', help='the web camera address, local camera or rtsp address.') - parser.add_argument('--yaml', type=str, default='data/coco.yaml', help='data yaml file.') + parser.add_argument('--webcam-addr', type=str, default='6', help='the web camera address, local camera or rtsp address.') + parser.add_argument('--yaml', type=str, default='data/test.yaml', help='data yaml file.') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='the image-size(h,w) in inference size.') parser.add_argument('--conf-thres', type=float, default=0.4, help='confidence threshold for inference.') parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold for inference.') @@ -29,7 +29,7 @@ def get_args_parser(add_help=True): parser.add_argument('--device', default='0', help='device to run our model i.e. 0 or 0,1,2,3 or cpu.') parser.add_argument('--save-txt', action='store_true', help='save results to *.txt.') parser.add_argument('--not-save-img', action='store_true', help='do not save visuallized inference results.') - parser.add_argument('--save-dir', type=str, help='directory to save predictions in. See --save-txt.') + parser.add_argument('--save-dir', type=str, default='./runs/inference', help='directory to save predictions in. See --save-txt.') parser.add_argument('--view-img', action='store_true', help='show inference results') parser.add_argument('--classes', nargs='+', type=int, help='filter by classes, e.g. --classes 0, or --classes 0 2 3.') parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS.') @@ -38,6 +38,7 @@ def get_args_parser(add_help=True): parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels.') parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences.') parser.add_argument('--half', action='store_true', help='whether to use FP16 half-precision inference.') + parser.add_argument('--issolo', action='store_true', help='solo structure or not') args = parser.parse_args() LOGGER.info(args) @@ -66,6 +67,7 @@ def run(weights=osp.join(ROOT, 'yolov6s.pt'), hide_labels=False, hide_conf=False, half=False, + issolo=False ): """ Inference process, supporting inference on one image file or directory which containing images. 
Args: @@ -105,7 +107,7 @@ def run(weights=osp.join(ROOT, 'yolov6s.pt'), # Inference inferer = Inferer(source, webcam, webcam_addr, weights, device, yaml, img_size, half) - inferer.infer(conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, not not_save_img, hide_labels, hide_conf, view_img) + inferer.infer(conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, not not_save_img, hide_labels, hide_conf, view_img, issolo=issolo) if save_txt or not not_save_img: LOGGER.info(f"Results saved to {save_dir}") diff --git a/tools/train.py b/tools/train.py index 635c68e4..9771e562 100644 --- a/tools/train.py +++ b/tools/train.py @@ -25,10 +25,10 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description='YOLOv6 PyTorch Training', add_help=add_help) parser.add_argument('--data-path', default='./data/coco.yaml', type=str, help='path of dataset') - parser.add_argument('--conf-file', default='./configs/yolov6n.py', type=str, help='experiments description file') + parser.add_argument('--conf-file', default='./configs/yolov6s.py', type=str, help='experiments description file') parser.add_argument('--img-size', default=640, type=int, help='train, val image size (pixels)') parser.add_argument('--rect', action='store_true', help='whether to use rectangular training, default is False') - parser.add_argument('--batch-size', default=32, type=int, help='total batch size for all GPUs') + parser.add_argument('--batch-size', default=16, type=int, help='total batch size for all GPUs') parser.add_argument('--epochs', default=400, type=int, help='number of total epochs to run') parser.add_argument('--workers', default=8, type=int, help='number of data loading workers (default: 8)') parser.add_argument('--device', default='0', type=str, help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') @@ -45,7 +45,7 @@ def get_args_parser(add_help=True): parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume the most recent training') parser.add_argument('--write_trainbatch_tb', action='store_true', help='write train_batch image to tensorboard once an epoch, may slightly slower train speed if open') - parser.add_argument('--stop_aug_last_n_epoch', default=15, type=int, help='stop strong aug at last n epoch, neg value not stop, default 15') + parser.add_argument('--stop_aug_last_n_epoch', default=-1, type=int, help='stop strong aug at last n epoch, neg value not stop, default 15') parser.add_argument('--save_ckpt_on_last_n_epoch', default=-1, type=int, help='save last n epoch even not best or last, neg value not save') parser.add_argument('--distill', action='store_true', help='distill or not') parser.add_argument('--distill_feat', action='store_true', help='distill featmap or not') @@ -54,7 +54,7 @@ def get_args_parser(add_help=True): parser.add_argument('--teacher_model_path', type=str, default=None, help='teacher model path') parser.add_argument('--temperature', type=int, default=20, help='distill temperature') parser.add_argument('--fuse_ab', action='store_true', help='fuse ab branch in training process or not') - parser.add_argument('--bs_per_gpu', default=32, type=int, help='batch size per GPU for auto-rescale learning rate, set to 16 for P6 models') + parser.add_argument('--bs_per_gpu', default=8, type=int, help='batch size per GPU for auto-rescale learning rate, set to 16 for P6 models') parser.add_argument('--specific-shape', action='store_true', help='rectangular training') parser.add_argument('--height', type=int, default=None, help='image height of model input') parser.add_argument('--width', type=int, default=None, help='image width of model input') diff --git a/yolov6/assigners/anchor_generator.py b/yolov6/assigners/anchor_generator.py index c8276418..3a41e0ba 100644 --- a/yolov6/assigners/anchor_generator.py +++ b/yolov6/assigners/anchor_generator.py @@ -1,7 +1,5 @@ import torch -from yolov6.utils.general import check_version -torch_1_10_plus = check_version(torch.__version__, minimum='1.10.0') def generate_anchors(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0.5, device='cpu', is_eval=False, mode='af'): '''Generate anchors from features.''' @@ -15,7 +13,10 @@ def generate_anchors(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0. _, _, h, w = feats[i].shape shift_x = torch.arange(end=w, device=device) + grid_cell_offset shift_y = torch.arange(end=h, device=device) + grid_cell_offset - shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') if torch_1_10_plus else torch.meshgrid(shift_y, shift_x) + try: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') + except: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x) anchor_point = torch.stack( [shift_x, shift_y], axis=-1).to(torch.float) if mode == 'af': # anchor-free @@ -37,7 +38,10 @@ def generate_anchors(feats, fpn_strides, grid_cell_size=5.0, grid_cell_offset=0. 
cell_half_size = grid_cell_size * stride * 0.5 shift_x = (torch.arange(end=w, device=device) + grid_cell_offset) * stride shift_y = (torch.arange(end=h, device=device) + grid_cell_offset) * stride - shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') if torch_1_10_plus else torch.meshgrid(shift_y, shift_x) + try: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing='ij') + except: + shift_y, shift_x = torch.meshgrid(shift_y, shift_x) anchor = torch.stack( [ shift_x - cell_half_size, shift_y - cell_half_size, diff --git a/yolov6/assigners/atss_assigner.py b/yolov6/assigners/atss_assigner.py index 12a5f243..c1d51e74 100644 --- a/yolov6/assigners/atss_assigner.py +++ b/yolov6/assigners/atss_assigner.py @@ -21,7 +21,8 @@ def forward(self, gt_labels, gt_bboxes, mask_gt, - pd_bboxes): + pd_bboxes, + gt_segmasks): r"""This code is based on https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py @@ -47,7 +48,8 @@ def forward(self, return torch.full( [self.bs, self.n_anchors], self.bg_idx).to(device), \ torch.zeros([self.bs, self.n_anchors, 4]).to(device), \ torch.zeros([self.bs, self.n_anchors, self.num_classes]).to(device), \ - torch.zeros([self.bs, self.n_anchors]).to(device) + torch.zeros([self.bs, self.n_anchors]).to(device), \ + torch.zeros(*pd_bboxes.shape[:2], 40, 40) overlaps = iou2d_calculator(gt_bboxes.reshape([-1, 4]), anc_bboxes) @@ -74,7 +76,7 @@ def forward(self, mask_pos, overlaps, self.n_max_boxes) # assigned target - target_labels, target_bboxes, target_scores = self.get_targets( + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( gt_labels, gt_bboxes, target_gt_idx, fg_mask) # soft label with iou @@ -83,7 +85,7 @@ def forward(self, ious = ious.max(axis=-2)[0].unsqueeze(-1) target_scores *= ious - return target_labels.long(), target_bboxes, target_scores, fg_mask.bool() + return target_labels.long(), target_bboxes, target_scores, fg_mask.bool(), target_segmasks def select_topk_candidates(self, distances, @@ -139,7 +141,8 @@ def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, - fg_mask): + fg_mask, + gt_segmasks): # assigned target labels batch_idx = torch.arange(self.bs, dtype=gt_labels.dtype, device=gt_labels.device) @@ -158,4 +161,7 @@ def get_targets(self, target_scores = F.one_hot(target_labels.long(), self.num_classes + 1).float() target_scores = target_scores[:, :, :self.num_classes] - return target_labels, target_bboxes, target_scores + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx.flatten()] + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/assigners/atss_assigner_seg.py b/yolov6/assigners/atss_assigner_seg.py new file mode 100644 index 00000000..bf844387 --- /dev/null +++ b/yolov6/assigners/atss_assigner_seg.py @@ -0,0 +1,166 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from yolov6.assigners.iou2d_calculator import iou2d_calculator +from yolov6.assigners.assigner_utils import dist_calculator, select_candidates_in_gts, select_highest_overlaps, iou_calculator + +class ATSSAssigner(nn.Module): + '''Adaptive Training Sample Selection Assigner''' + def __init__(self, + topk=9, + num_classes=80): + super(ATSSAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + + @torch.no_grad() + def forward(self, + anc_bboxes, + n_level_bboxes, + gt_labels, + gt_bboxes, + mask_gt, + pd_bboxes, + 
gt_segmasks): + r"""This code is based on + https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py + + Args: + anc_bboxes (Tensor): shape(num_total_anchors, 4) + n_level_bboxes (List):len(3) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + pd_bboxes (Tensor): shape(bs, n_max_boxes, 4) + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + """ + self.n_anchors = anc_bboxes.size(0) + self.bs = gt_bboxes.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return torch.full( [self.bs, self.n_anchors], self.bg_idx).to(device), \ + torch.zeros([self.bs, self.n_anchors, 4]).to(device), \ + torch.zeros([self.bs, self.n_anchors, self.num_classes]).to(device), \ + torch.zeros([self.bs, self.n_anchors]).to(device) + + + overlaps = iou2d_calculator(gt_bboxes.reshape([-1, 4]), anc_bboxes) + overlaps = overlaps.reshape([self.bs, -1, self.n_anchors]) + + distances, ac_points = dist_calculator(gt_bboxes.reshape([-1, 4]), anc_bboxes) + distances = distances.reshape([self.bs, -1, self.n_anchors]) + + is_in_candidate, candidate_idxs = self.select_topk_candidates( + distances, n_level_bboxes, mask_gt) + + overlaps_thr_per_gt, iou_candidates = self.thres_calculator( + is_in_candidate, candidate_idxs, overlaps) + + # select candidates iou >= threshold as positive + is_pos = torch.where( + iou_candidates > overlaps_thr_per_gt.repeat([1, 1, self.n_anchors]), + is_in_candidate, torch.zeros_like(is_in_candidate)) + + is_in_gts = select_candidates_in_gts(ac_points, gt_bboxes) + mask_pos = is_pos * is_in_gts * mask_gt + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps( + mask_pos, overlaps, self.n_max_boxes) + + # assigned target + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( + gt_labels, gt_bboxes, target_gt_idx, fg_mask, gt_segmasks) + + # soft label with iou + if pd_bboxes is not None: + ious = iou_calculator(gt_bboxes, pd_bboxes) * mask_pos + ious = ious.max(axis=-2)[0].unsqueeze(-1) + target_scores *= ious + + return target_labels.long(), target_bboxes, target_scores, fg_mask.bool(), target_segmasks + + def select_topk_candidates(self, + distances, + n_level_bboxes, + mask_gt): + + mask_gt = mask_gt.repeat(1, 1, self.topk).bool() + level_distances = torch.split(distances, n_level_bboxes, dim=-1) + is_in_candidate_list = [] + candidate_idxs = [] + start_idx = 0 + for per_level_distances, per_level_boxes in zip(level_distances, n_level_bboxes): + + end_idx = start_idx + per_level_boxes + selected_k = min(self.topk, per_level_boxes) + _, per_level_topk_idxs = per_level_distances.topk(selected_k, dim=-1, largest=False) + candidate_idxs.append(per_level_topk_idxs + start_idx) + per_level_topk_idxs = torch.where(mask_gt, + per_level_topk_idxs, torch.zeros_like(per_level_topk_idxs)) + is_in_candidate = F.one_hot(per_level_topk_idxs, per_level_boxes).sum(dim=-2) + is_in_candidate = torch.where(is_in_candidate > 1, + torch.zeros_like(is_in_candidate), is_in_candidate) + is_in_candidate_list.append(is_in_candidate.to(distances.dtype)) + start_idx = end_idx + + is_in_candidate_list = torch.cat(is_in_candidate_list, dim=-1) + candidate_idxs = torch.cat(candidate_idxs, dim=-1) + + return is_in_candidate_list, 
candidate_idxs + + def thres_calculator(self, + is_in_candidate, + candidate_idxs, + overlaps): + + n_bs_max_boxes = self.bs * self.n_max_boxes + _candidate_overlaps = torch.where(is_in_candidate > 0, + overlaps, torch.zeros_like(overlaps)) + candidate_idxs = candidate_idxs.reshape([n_bs_max_boxes, -1]) + assist_idxs = self.n_anchors * torch.arange(n_bs_max_boxes, device=candidate_idxs.device) + assist_idxs = assist_idxs[:,None] + faltten_idxs = candidate_idxs + assist_idxs + candidate_overlaps = _candidate_overlaps.reshape(-1)[faltten_idxs] + candidate_overlaps = candidate_overlaps.reshape([self.bs, self.n_max_boxes, -1]) + + overlaps_mean_per_gt = candidate_overlaps.mean(axis=-1, keepdim=True) + overlaps_std_per_gt = candidate_overlaps.std(axis=-1, keepdim=True) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + return overlaps_thr_per_gt, _candidate_overlaps + + def get_targets(self, + gt_labels, + gt_bboxes, + target_gt_idx, + fg_mask, + gt_segmasks): + + # assigned target labels + batch_idx = torch.arange(self.bs, dtype=gt_labels.dtype, device=gt_labels.device) + batch_idx = batch_idx[...,None] + target_gt_idx = (target_gt_idx + batch_idx * self.n_max_boxes).long() + target_labels = gt_labels.flatten()[target_gt_idx.flatten()] + target_labels = target_labels.reshape([self.bs, self.n_anchors]) + target_labels = torch.where(fg_mask > 0, + target_labels, torch.full_like(target_labels, self.bg_idx)) + + # assigned target boxes + target_bboxes = gt_bboxes.reshape([-1, 4])[target_gt_idx.flatten()] + target_bboxes = target_bboxes.reshape([self.bs, self.n_anchors, 4]) + + # assigned target scores + target_scores = F.one_hot(target_labels.long(), self.num_classes + 1).float() + target_scores = target_scores[:, :, :self.num_classes] + + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx.flatten()] + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/assigners/tal_assigner.py b/yolov6/assigners/tal_assigner.py index 45008f5a..d1bd404a 100644 --- a/yolov6/assigners/tal_assigner.py +++ b/yolov6/assigners/tal_assigner.py @@ -25,7 +25,8 @@ def forward(self, anc_points, gt_labels, gt_bboxes, - mask_gt): + mask_gt, + gt_segmasks): """This code referenced to https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py @@ -50,10 +51,11 @@ def forward(self, return torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), \ torch.zeros_like(pd_bboxes).to(device), \ torch.zeros_like(pd_scores).to(device), \ - torch.zeros_like(pd_scores[..., 0]).to(device) + torch.zeros_like(pd_scores[..., 0]).to(device), \ + torch.zeros(*pd_bboxes.shape[:2], 40, 40) cycle, step, self.bs = (1, self.bs, self.bs) if self.n_max_boxes <= 100 else (self.bs, 1, 1) - target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst = [], [], [], [] + target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst, target_segmasks_lst = [], [], [], [], [] # loop batch dim in case of numerous object box for i in range(cycle): start, end = i*step, (i+1)*step @@ -62,6 +64,7 @@ def forward(self, gt_labels_ = gt_labels[start:end, ...] gt_bboxes_ = gt_bboxes[start:end, ...] mask_gt_ = mask_gt[start:end, ...] + gt_segmasks_ = gt_segmasks[start:end, ...] 
mask_pos, align_metric, overlaps = self.get_pos_mask( pd_scores_, pd_bboxes_, gt_labels_, gt_bboxes_, anc_points, mask_gt_) @@ -70,8 +73,8 @@ def forward(self, mask_pos, overlaps, self.n_max_boxes) # assigned target - target_labels, target_bboxes, target_scores = self.get_targets( - gt_labels_, gt_bboxes_, target_gt_idx, fg_mask) + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( + gt_labels_, gt_bboxes_, target_gt_idx, fg_mask, gt_segmasks_) # normalize align_metric *= mask_pos @@ -85,14 +88,16 @@ def forward(self, target_bboxes_lst.append(target_bboxes) target_scores_lst.append(target_scores) fg_mask_lst.append(fg_mask) + target_segmasks_lst.append(target_segmasks) # concat target_labels = torch.cat(target_labels_lst, 0) target_bboxes = torch.cat(target_bboxes_lst, 0) target_scores = torch.cat(target_scores_lst, 0) fg_mask = torch.cat(fg_mask_lst, 0) + target_segmasks = torch.cat(target_segmasks_lst, 0) - return target_labels, target_bboxes, target_scores, fg_mask.bool() + return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_segmasks def get_pos_mask(self, pd_scores, @@ -153,7 +158,8 @@ def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, - fg_mask): + fg_mask, + gt_segmasks): # assigned target labels batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[...,None] @@ -169,5 +175,8 @@ def get_targets(self, fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) target_scores = torch.where(fg_scores_mask > 0, target_scores, torch.full_like(target_scores, 0)) + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx] - return target_labels, target_bboxes, target_scores + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/assigners/tal_assigner_seg.py b/yolov6/assigners/tal_assigner_seg.py new file mode 100644 index 00000000..057c718b --- /dev/null +++ b/yolov6/assigners/tal_assigner_seg.py @@ -0,0 +1,185 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from yolov6.assigners.assigner_utils import select_candidates_in_gts, select_highest_overlaps, iou_calculator, dist_calculator + +class TaskAlignedAssigner(nn.Module): + def __init__(self, + topk=13, + num_classes=80, + alpha=1.0, + beta=6.0, + eps=1e-9): + super(TaskAlignedAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, + pd_scores, + pd_bboxes, + anc_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks): + """This code referenced to + https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py + + Args: + pd_scores (Tensor): shape(bs, num_total_anchors, num_classes) + pd_bboxes (Tensor): shape(bs, num_total_anchors, 4) + anc_points (Tensor): shape(num_total_anchors, 2) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + """ + self.bs = pd_scores.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return torch.full_like(pd_scores[..., 0], 
self.bg_idx).to(device), \ + torch.zeros_like(pd_bboxes).to(device), \ + torch.zeros_like(pd_scores).to(device), \ + torch.zeros_like(pd_scores[..., 0]).to(device), \ + [] + #torch.zeros(*pd_bboxes.shape[:2]).to(device) + + + # cycle, step, self.bs = (1, self.bs, self.bs) if self.n_max_boxes <= 100 else (self.bs, 1, 1) + cycle, step, self.bs = (1, self.bs, self.bs) + target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst, idx_lst = [], [], [], [], [] + # loop batch dim in case of numerous object box + for i in range(cycle): + start, end = i*step, (i+1)*step + pd_scores_ = pd_scores[start:end, ...] + pd_bboxes_ = pd_bboxes[start:end, ...] + gt_labels_ = gt_labels[start:end, ...] + gt_bboxes_ = gt_bboxes[start:end, ...] + mask_gt_ = mask_gt[start:end, ...] + # gt_segmasks_ = gt_segmasks[start:end, ...] + + mask_pos, align_metric, overlaps = self.get_pos_mask( + pd_scores_, pd_bboxes_, gt_labels_, gt_bboxes_, anc_points, mask_gt_) + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps( + mask_pos, overlaps, self.n_max_boxes) + + # assigned target + target_labels, target_bboxes, target_scores, idx = self.get_targets( + gt_labels_, gt_bboxes_, target_gt_idx, fg_mask) + + # normalize + align_metric *= mask_pos + pos_align_metrics = align_metric.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * mask_pos).max(axis=-1, keepdim=True)[0] + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + # append + target_labels_lst.append(target_labels) + idx_lst.append(idx) + target_bboxes_lst.append(target_bboxes) + target_scores_lst.append(target_scores) + fg_mask_lst.append(fg_mask) + # target_segmasks_lst.append(target_segmasks) + + # concat + target_labels = torch.cat(target_labels_lst, 0) + target_bboxes = torch.cat(target_bboxes_lst, 0) + target_scores = torch.cat(target_scores_lst, 0) + fg_mask = torch.cat(fg_mask_lst, 0) + # target_segmasks = torch.cat(target_segmasks_lst, 0) + + return target_labels, target_bboxes, target_scores, fg_mask.bool(), idx_lst + + def get_pos_mask(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes, + anc_points, + mask_gt): + + # get anchor_align metric + align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes) + # get in_gts mask + mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes) + # get topk_metric mask + mask_topk = self.select_topk_candidates( + align_metric * mask_in_gts, topk_mask=mask_gt.repeat([1, 1, self.topk]).bool()) + # merge all mask to a final mask + mask_pos = mask_topk * mask_in_gts * mask_gt + + return mask_pos, align_metric, overlaps + + def get_box_metrics(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes): + + pd_scores = pd_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) + ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes) + ind[1] = gt_labels.squeeze(-1) + bbox_scores = pd_scores[ind[0], ind[1]] + + overlaps = iou_calculator(gt_bboxes, pd_bboxes) + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + + return align_metric, overlaps + + def select_topk_candidates(self, + metrics, + largest=True, + topk_mask=None): + + num_anchors = metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + metrics, self.topk, axis=-1, largest=largest) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > self.eps).tile( + 
[1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, + torch.zeros_like(is_in_topk), is_in_topk) + return is_in_topk.to(metrics.dtype) + + def get_targets(self, + gt_labels, + gt_bboxes, + target_gt_idx, + fg_mask): + + # assigned target labels + batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[...,None] + target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes + target_labels = gt_labels.long().flatten()[target_gt_idx] + + # assigned target boxes + target_bboxes = gt_bboxes.reshape([-1, 4])[target_gt_idx] + + # assigned target scores + target_labels[target_labels<0] = 0 + target_scores = F.one_hot(target_labels, self.num_classes) + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) + target_scores = torch.where(fg_scores_mask > 0, target_scores, + torch.full_like(target_scores, 0)) + # m_shape = gt_segmasks.shape[-2:] + # target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx] + + + return target_labels, target_bboxes, target_scores, target_gt_idx diff --git a/yolov6/assigners/tal_assigner_seg2.py b/yolov6/assigners/tal_assigner_seg2.py new file mode 100644 index 00000000..aa1101cd --- /dev/null +++ b/yolov6/assigners/tal_assigner_seg2.py @@ -0,0 +1,183 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from yolov6.assigners.assigner_utils import select_candidates_in_gts, select_highest_overlaps, iou_calculator, dist_calculator + +class TaskAlignedAssigner(nn.Module): + def __init__(self, + topk=13, + num_classes=80, + alpha=1.0, + beta=6.0, + eps=1e-9): + super(TaskAlignedAssigner, self).__init__() + self.topk = topk + self.num_classes = num_classes + self.bg_idx = num_classes + self.alpha = alpha + self.beta = beta + self.eps = eps + + @torch.no_grad() + def forward(self, + pd_scores, + pd_bboxes, + anc_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks): + """This code referenced to + https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py + + Args: + pd_scores (Tensor): shape(bs, num_total_anchors, num_classes) + pd_bboxes (Tensor): shape(bs, num_total_anchors, 4) + anc_points (Tensor): shape(num_total_anchors, 2) + gt_labels (Tensor): shape(bs, n_max_boxes, 1) + gt_bboxes (Tensor): shape(bs, n_max_boxes, 4) + mask_gt (Tensor): shape(bs, n_max_boxes, 1) + Returns: + target_labels (Tensor): shape(bs, num_total_anchors) + target_bboxes (Tensor): shape(bs, num_total_anchors, 4) + target_scores (Tensor): shape(bs, num_total_anchors, num_classes) + fg_mask (Tensor): shape(bs, num_total_anchors) + """ + self.bs = pd_scores.size(0) + self.n_max_boxes = gt_bboxes.size(1) + + if self.n_max_boxes == 0: + device = gt_bboxes.device + return torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), \ + torch.zeros_like(pd_bboxes).to(device), \ + torch.zeros_like(pd_scores).to(device), \ + torch.zeros_like(pd_scores[..., 0]).to(device), \ + torch.zeros(*pd_bboxes.shape[:2], 40, 40) + + cycle, step, self.bs = (1, self.bs, self.bs) if self.n_max_boxes <= 100 else (self.bs, 1, 1) + target_labels_lst, target_bboxes_lst, target_scores_lst, fg_mask_lst, target_segmasks_lst = [], [], [], [], [] + # loop batch dim in case of numerous object box + for i in range(cycle): + start, end = i*step, (i+1)*step + pd_scores_ = pd_scores[start:end, ...] + pd_bboxes_ = pd_bboxes[start:end, ...] 
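select_topk_candidates above turns the per-GT alignment metric into an anchor mask by scattering the top-k anchor indices into a one-hot map and zeroing anchors that were selected more than once. A small sketch with assumed shapes:

```python
# Sketch of the top-k candidate mask construction (shapes are illustrative, not the patch itself).
import torch
import torch.nn.functional as F

topk, eps = 3, 1e-9
metrics = torch.rand(2, 4, 10)                  # (bs, n_max_boxes, n_anchors) alignment metric
topk_metrics, topk_idxs = torch.topk(metrics, topk, dim=-1, largest=True)

# keep only GTs whose best metric exceeds eps
topk_mask = (topk_metrics.max(dim=-1, keepdim=True)[0] > eps).tile([1, 1, topk])
topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs))

# scatter the k indices into a one-hot anchor mask; drop anchors picked twice
is_in_topk = F.one_hot(topk_idxs, metrics.shape[-1]).sum(dim=-2)   # (bs, n_max_boxes, n_anchors)
is_in_topk = torch.where(is_in_topk > 1, torch.zeros_like(is_in_topk), is_in_topk)
print(is_in_topk.shape)                          # torch.Size([2, 4, 10])
```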
+ gt_labels_ = gt_labels[start:end, ...] + gt_bboxes_ = gt_bboxes[start:end, ...] + mask_gt_ = mask_gt[start:end, ...] + gt_segmasks_ = gt_segmasks[start:end, ...] + + mask_pos, align_metric, overlaps = self.get_pos_mask( + pd_scores_, pd_bboxes_, gt_labels_, gt_bboxes_, anc_points, mask_gt_) + + target_gt_idx, fg_mask, mask_pos = select_highest_overlaps( + mask_pos, overlaps, self.n_max_boxes) + + # assigned target + target_labels, target_bboxes, target_scores, target_segmasks = self.get_targets( + gt_labels_, gt_bboxes_, target_gt_idx, fg_mask, gt_segmasks_) + + # normalize + align_metric *= mask_pos + pos_align_metrics = align_metric.max(axis=-1, keepdim=True)[0] + pos_overlaps = (overlaps * mask_pos).max(axis=-1, keepdim=True)[0] + norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).max(-2)[0].unsqueeze(-1) + target_scores = target_scores * norm_align_metric + + # append + target_labels_lst.append(target_labels) + target_bboxes_lst.append(target_bboxes) + target_scores_lst.append(target_scores) + fg_mask_lst.append(fg_mask) + target_segmasks_lst.append(target_segmasks) + + # concat + target_labels = torch.cat(target_labels_lst, 0) + target_bboxes = torch.cat(target_bboxes_lst, 0) + target_scores = torch.cat(target_scores_lst, 0) + fg_mask = torch.cat(fg_mask_lst, 0) + target_segmasks = torch.cat(target_segmasks_lst, 0) + + return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_segmasks + + def get_pos_mask(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes, + anc_points, + mask_gt): + + # get anchor_align metric + align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes) + # get in_gts mask + mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes) + # get topk_metric mask + mask_topk = self.select_topk_candidates( + align_metric * mask_in_gts, topk_mask=mask_gt.repeat([1, 1, self.topk]).bool()) + # merge all mask to a final mask + mask_pos = mask_topk * mask_in_gts * mask_gt + + return mask_pos, align_metric, overlaps + + def get_box_metrics(self, + pd_scores, + pd_bboxes, + gt_labels, + gt_bboxes): + + pd_scores = pd_scores.permute(0, 2, 1) + gt_labels = gt_labels.to(torch.long) + ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long) + ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes) + ind[1] = gt_labels.squeeze(-1) + bbox_scores = pd_scores[ind[0], ind[1]] + + overlaps = iou_calculator(gt_bboxes, pd_bboxes) + align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) + + return align_metric, overlaps + + def select_topk_candidates(self, + metrics, + largest=True, + topk_mask=None): + + num_anchors = metrics.shape[-1] + topk_metrics, topk_idxs = torch.topk( + metrics, self.topk, axis=-1, largest=largest) + if topk_mask is None: + topk_mask = (topk_metrics.max(axis=-1, keepdim=True) > self.eps).tile( + [1, 1, self.topk]) + topk_idxs = torch.where(topk_mask, topk_idxs, torch.zeros_like(topk_idxs)) + is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(axis=-2) + is_in_topk = torch.where(is_in_topk > 1, + torch.zeros_like(is_in_topk), is_in_topk) + return is_in_topk.to(metrics.dtype) + + def get_targets(self, + gt_labels, + gt_bboxes, + target_gt_idx, + fg_mask, + gt_segmasks): + + # assigned target labels + batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[...,None] + target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes + target_labels = gt_labels.long().flatten()[target_gt_idx] + + # assigned target boxes 
+ target_bboxes = gt_bboxes.reshape([-1, 4])[target_gt_idx] + + # assigned target scores + target_labels[target_labels<0] = 0 + target_scores = F.one_hot(target_labels, self.num_classes) + fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) + target_scores = torch.where(fg_scores_mask > 0, target_scores, + torch.full_like(target_scores, 0)) + m_shape = gt_segmasks.shape[-2:] + target_segmasks = gt_segmasks.reshape([-1, m_shape[0], m_shape[1]])[target_gt_idx] + print(target_gt_idx.shape, fg_mask.shape) + + + return target_labels, target_bboxes, target_scores, target_segmasks diff --git a/yolov6/core/engine.py b/yolov6/core/engine.py index 10545135..663a0812 100644 --- a/yolov6/core/engine.py +++ b/yolov6/core/engine.py @@ -21,7 +21,6 @@ from yolov6.models.yolo import build_model from yolov6.models.yolo_lite import build_model as build_lite_model -from yolov6.models.losses.loss import ComputeLoss as ComputeLoss from yolov6.models.losses.loss_fuseab import ComputeLoss as ComputeLoss_ab from yolov6.models.losses.loss_distill import ComputeLoss as ComputeLoss_distill from yolov6.models.losses.loss_distill_ns import ComputeLoss as ComputeLoss_distill_ns @@ -35,6 +34,8 @@ from yolov6.utils.general import download_ckpt + + class Trainer: def __init__(self, args, cfg, device): self.args = args @@ -42,6 +43,8 @@ def __init__(self, args, cfg, device): self.device = device self.max_epoch = args.epochs + + if args.resume: self.ckpt = torch.load(args.resume, map_location='cpu') @@ -105,8 +108,8 @@ def __init__(self, args, cfg, device): self.height = args.height self.width = args.width - self.loss_num = 3 - self.loss_info = ['Epoch', 'lr', 'iou_loss', 'dfl_loss', 'cls_loss'] + self.loss_num = 4 + self.loss_info = ['Epoch', 'lr', 'iou_loss', 'dfl_loss', 'cls_loss', "seg_loss"] if self.args.distill: self.loss_num += 1 self.loss_info += ['cwd_loss'] @@ -140,7 +143,9 @@ def train_one_epoch(self, epoch_num): # Training one batch data. 
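In get_targets above, the assigned labels are one-hot encoded into per-anchor class-score targets and then zeroed for background anchors via fg_mask. A standalone sketch of that step, with assumed shapes:

```python
# Sketch of one-hot class-score targets masked to foreground anchors (assumed shapes).
import torch
import torch.nn.functional as F

bs, n_anchors, num_classes = 2, 5, 80
target_labels = torch.randint(0, num_classes, (bs, n_anchors))
fg_mask = torch.randint(0, 2, (bs, n_anchors))                   # 1 = anchor matched to a GT

target_scores = F.one_hot(target_labels, num_classes).float()    # (bs, n_anchors, num_classes)
fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, num_classes)
target_scores = torch.where(fg_scores_mask > 0, target_scores,
                            torch.zeros_like(target_scores))
print(target_scores.shape)                                       # torch.Size([2, 5, 80])
```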
def train_in_steps(self, epoch_num, step_num): - images, targets = self.prepro_data(self.batch_data, self.device) + # torch.cuda.synchronize() + # qq1 = time.time() + images, targets, segmasks = self.prepro_data(self.batch_data, self.device) # plot train_batch and save to tensorboard once an epoch if self.write_trainbatch_tb and self.main_process and self.step == 0: self.plot_train_batch(images, targets) @@ -149,7 +154,11 @@ def train_in_steps(self, epoch_num, step_num): # forward with amp.autocast(enabled=self.device != 'cpu'): _, _, batch_height, batch_width = images.shape + # torch.cuda.synchronize() + # qq2 = time.time() preds, s_featmaps = self.model(images) + # torch.cuda.synchronize() + # qq3 = time.time() if self.args.distill: with torch.no_grad(): t_preds, t_featmaps = self.teacher_model(images) @@ -159,18 +168,21 @@ def train_in_steps(self, epoch_num, step_num): batch_height, batch_width) elif self.args.fuse_ab: - total_loss, loss_items = self.compute_loss((preds[0],preds[3],preds[4]), targets, epoch_num, - step_num, batch_height, batch_width) # YOLOv6_af - total_loss_ab, loss_items_ab = self.compute_loss_ab(preds[:3], targets, epoch_num, step_num, - batch_height, batch_width) # YOLOv6_ab + total_loss, loss_items = self.compute_loss((preds[0],preds[3],preds[4], preds[5]), targets, epoch_num, + step_num, batch_height, batch_width, segmasks) # YOLOv6_af + total_loss_ab, loss_items_ab = self.compute_loss_ab((preds[0],preds[1],preds[2], preds[6]), targets, epoch_num, step_num, + batch_height, batch_width, segmasks) # YOLOv6_ab total_loss += total_loss_ab loss_items += loss_items_ab else: - total_loss, loss_items = self.compute_loss(preds, targets, epoch_num, step_num, - batch_height, batch_width) # YOLOv6_af + total_loss, loss_items = self.compute_loss((preds[0],preds[3],preds[4], preds[5]), targets, epoch_num, step_num, + batch_height, batch_width, segmasks, img=images) # YOLOv6_af if self.rank != -1: total_loss *= self.world_size + # torch.cuda.synchronize() + # qq4 = time.time() # backward + # print("prepare : {}s | model : {}s | loss : {}s".format(qq2 - qq1, qq3 - qq2, qq4 - qq3)) self.scaler.scale(total_loss).backward() self.loss_items = loss_items self.update_optimizer() @@ -186,12 +198,12 @@ def after_epoch(self): is_val_epoch = (remaining_epochs == 0) or ((not self.args.eval_final_only) and ((self.epoch + 1) % eval_interval == 0)) if is_val_epoch: self.eval_model() - self.ap = self.evaluate_results[1] + self.ap = self.evaluate_results[3] self.best_ap = max(self.ap, self.best_ap) # save ckpt ckpt = { - 'model': deepcopy(de_parallel(self.model)).half(), - 'ema': deepcopy(self.ema.ema).half(), + 'model': deepcopy(de_parallel(self.model)), + 'ema': deepcopy(self.ema.ema), 'updates': self.ema.updates, 'optimizer': self.optimizer.state_dict(), 'scheduler': self.scheduler.state_dict(), @@ -231,7 +243,10 @@ def eval_model(self): task='train', specific_shape=self.specific_shape, height=self.height, - width=self.width + width=self.width, + do_pr_metric=True, + do_coco_metric=False, + issolo=self.cfg.model.head.issolo ) else: def get_cfg_value(cfg_dict, value_str, default_value): @@ -263,10 +278,10 @@ def get_cfg_value(cfg_dict, value_str, default_value): width=self.width ) - LOGGER.info(f"Epoch: {self.epoch} | mAP@0.5: {results[0]} | mAP@0.50:0.95: {results[1]}") - self.evaluate_results = results[:2] + LOGGER.info(f"Epoch: {self.epoch} | box_mAP@0.5: {results[0]} | box_mAP@0.50:0.95: {results[1]} | mask_mAP@0.5: {results[2]} | mask_mAP@0.50:0.95: {results[3]}") + self.evaluate_results 
= [results[1], results[3]] # plot validation predictions - self.plot_val_pred(vis_outputs, vis_paths) + # self.plot_val_pred(vis_outputs, vis_paths) def before_train_loop(self): @@ -286,6 +301,10 @@ def before_train_loop(self): self.best_ap = self.evaluate_results[1] self.best_stop_strong_aug_ap = self.evaluate_results[1] + if self.cfg.model.head.issolo: + from yolov6.models.losses.seg_loss_solo_main import ComputeLoss as ComputeLoss + else: + from yolov6.models.losses.seg_loss import ComputeLoss as ComputeLoss self.compute_loss = ComputeLoss(num_classes=self.data_dict['nc'], ori_img_size=self.img_size, @@ -293,6 +312,7 @@ def before_train_loop(self): use_dfl=self.cfg.model.head.use_dfl, reg_max=self.cfg.model.head.reg_max, iou_type=self.cfg.model.head.iou_type, + nm=self.cfg.model.head.nm, fpn_strides=self.cfg.model.head.strides) if self.args.fuse_ab: @@ -305,7 +325,7 @@ def before_train_loop(self): fpn_strides=self.cfg.model.head.strides, ) if self.args.distill : - if self.cfg.model.type in ['YOLOv6n','YOLOv6s']: + if self.cfg.model.type in ['YOLOv6n','YOLOv6s']: Loss_distill_func = ComputeLoss_distill_ns else: Loss_distill_func = ComputeLoss_distill @@ -404,7 +424,8 @@ def get_data_loader(args, cfg, data_dict): def prepro_data(batch_data, device): images = batch_data[0].to(device, non_blocking=True).float() / 255 targets = batch_data[1].to(device) - return images, targets + segmask = batch_data[4].to(device) + return images, targets, segmask def get_model(self, args, cfg, nc, device): if 'YOLOv6-lite' in cfg.model.type: @@ -588,4 +609,4 @@ def quant_setup(self, model, cfg, device): # QAT flow load calibrated model assert cfg.qat.calib_pt is not None, 'Please provide calibrated model' model.load_state_dict(torch.load(cfg.qat.calib_pt)['model'].float().state_dict()) - model.to(device) + model.to(device) \ No newline at end of file diff --git a/yolov6/core/evaler.py b/yolov6/core/evaler.py index e79f51be..15c8bc76 100644 --- a/yolov6/core/evaler.py +++ b/yolov6/core/evaler.py @@ -7,13 +7,19 @@ import torch import yaml from pathlib import Path +import cv2 +from multiprocessing.pool import ThreadPool + + from pycocotools.coco import COCO from pycocotools.cocoeval import COCOeval +import torch.nn.functional as F + from yolov6.data.data_load import create_dataloader from yolov6.utils.events import LOGGER, NCOLS -from yolov6.utils.nms import non_max_suppression +from yolov6.utils.nms import non_max_suppression_seg, non_max_suppression_seg_solo from yolov6.utils.general import download_ckpt from yolov6.utils.checkpoint import load_checkpoint from yolov6.utils.torch_utils import time_sync, get_model_info @@ -87,24 +93,25 @@ def init_data(self, dataloader, task): self.is_coco = self.data.get("is_coco", False) self.ids = self.coco80_to_coco91_class() if self.is_coco else list(range(1000)) if task != 'train': + pad = 0.0 eval_hyp = { "shrink_size":self.shrink_size, } rect = self.infer_on_rect - pad = 0.5 if rect else 0.0 dataloader = create_dataloader(self.data[task if task in ('train', 'val', 'test') else 'val'], - self.img_size, self.batch_size, self.stride, hyp=eval_hyp, check_labels=True, pad=pad, rect=rect, + self.img_size, self.batch_size, self.stride, hyp=eval_hyp, check_labels=True, pad=0.5, rect=True, data_dict=self.data, task=task, specific_shape=self.specific_shape, height=self.height, width=self.width)[0] return dataloader - def predict_model(self, model, dataloader, task): + def predict_model(self, model, dataloader, task, issolo=False, weight_nums=66, bias_nums=1, dyconv_channels=66): 
'''Model prediction Predicts the whole dataset and gets the prediced results and inference time. ''' self.speed_result = torch.zeros(4, device=self.device) pred_results = [] pbar = tqdm(dataloader, desc=f"Inferencing model in {task} datasets.", ncols=NCOLS) - + weight_nums = [weight_nums] + bias_nums = [bias_nums] # whether to compute metric and plot PR curve and P、R、F1 curve under iou50 match rule if self.do_pr_metric: stats, ap = [], [] @@ -115,7 +122,7 @@ def predict_model(self, model, dataloader, task): from yolov6.utils.metrics import ConfusionMatrix confusion_matrix = ConfusionMatrix(nc=model.nc) - for i, (imgs, targets, paths, shapes) in enumerate(pbar): + for i, (imgs, targets, paths, shapes, masks) in enumerate(pbar): # pre-process t1 = time_sync() imgs = imgs.to(self.device, non_blocking=True) @@ -125,12 +132,23 @@ def predict_model(self, model, dataloader, task): # Inference t2 = time_sync() - outputs, _ = model(imgs) + toutputs, _ = model(imgs) self.speed_result[2] += time_sync() - t2 # inference time # post-process t3 = time_sync() - outputs = non_max_suppression(outputs, self.conf_thres, self.iou_thres, multi_label=True) + if not issolo: + loutputs = non_max_suppression_seg(toutputs, self.conf_thres, self.iou_thres, multi_label=True) + else: + loutputs = non_max_suppression_seg_solo(toutputs, self.conf_thres, self.iou_thres, multi_label=True) + protos = toutputs[1][0] + segments = [] + segconf = [loutputs[li][..., 0:] for li in range(len(loutputs))] + outputs = [loutputs[li][..., :6] for li in range(len(loutputs))] + if not issolo: + segments = [self.handle_proto_test([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], imgs.shape[-2:]) for li in range(len(loutputs))] + else: + segments = [self.handle_proto_solo([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], imgs.shape[-2:], weight_sums=weight_nums, bias_sums=bias_nums, dyconv=dyconv_channels) for li in range(len(loutputs))] self.speed_result[3] += time_sync() - t3 # post-process time self.speed_result[0] += len(outputs) @@ -139,7 +157,7 @@ def predict_model(self, model, dataloader, task): eval_outputs = copy.deepcopy([x.detach().cpu() for x in outputs]) # save result - pred_results.extend(self.convert_to_coco_format(outputs, imgs, paths, shapes, self.ids)) + # pred_results.extend(self.convert_to_coco_format_seg(outputs, imgs, paths, shapes, self.ids, segments)) # for tensorboard visualization, maximum images to show: 8 if i == 0: @@ -153,25 +171,29 @@ def predict_model(self, model, dataloader, task): # Statistics per image # This code is based on # https://github.com/ultralytics/yolov5/blob/master/val.py - for si, pred in enumerate(eval_outputs): + for si, (pred, pred_masks) in enumerate(zip(eval_outputs, segments)): labels = targets[targets[:, 0] == si, 1:] nl = len(labels) tcls = labels[:, 0].tolist() if nl else [] # target class seen += 1 + correct_masks = torch.zeros(len(pred), niou, dtype=torch.bool) # init + correct = torch.zeros(len(pred), niou, dtype=torch.bool) # init if len(pred) == 0: if nl: - stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls)) + stats.append((correct_masks, correct, torch.Tensor(), torch.Tensor(), tcls)) continue + # Masks + midx = targets[:, 0] == si + gt_masks = masks[midx] # Predictions predn = pred.clone() self.scale_coords(imgs[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1]) # native-space pred # Assign all predictions as incorrect - correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool) + if nl: - from 
yolov6.utils.nms import xywh2xyxy # target boxes @@ -183,49 +205,122 @@ def predict_model(self, model, dataloader, task): labelsn = torch.cat((labels[:, 0:1], tbox), 1) # native-space labels - from yolov6.utils.metrics import process_batch + from yolov6.utils.metrics import process_batch correct = process_batch(predn, labelsn, iouv) + correct_masks = process_batch(predn, labelsn, iouv, pred_masks, gt_masks, overlap=False, masks=True) if self.plot_confusion_matrix: confusion_matrix.process_batch(predn, labelsn) # Append statistics (correct, conf, pcls, tcls) - stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) + + + stats.append((correct_masks.cpu(), correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) if self.do_pr_metric: # Compute statistics stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy if len(stats) and stats[0].any(): - - from yolov6.utils.metrics import ap_per_class - p, r, ap, f1, ap_class = ap_per_class(*stats, plot=self.plot_curve, save_dir=self.save_dir, names=model.names) - AP50_F1_max_idx = len(f1.mean(0)) - f1.mean(0)[::-1].argmax() -1 - LOGGER.info(f"IOU 50 best mF1 thershold near {AP50_F1_max_idx/1000.0}.") - ap50, ap = ap[:, 0], ap.mean(1) # AP@0.5, AP@0.5:0.95 - mp, mr, map50, map = p[:, AP50_F1_max_idx].mean(), r[:, AP50_F1_max_idx].mean(), ap50.mean(), ap.mean() - nt = np.bincount(stats[3].astype(np.int64), minlength=model.nc) # number of targets per class + from yolov6.utils.metrics import ap_per_class_box_and_mask, Metrics + metrics = Metrics() + # v5 method + results = ap_per_class_box_and_mask(*stats, plot=self.plot_curve, save_dir=self.save_dir, names=model.names) + metrics.update(results) + nt = np.bincount(stats[4].astype(np.int64), minlength=model.nc) # number of targets per class # Print results - s = ('%-16s' + '%12s' * 7) % ('Class', 'Images', 'Labels', 'P@.5iou', 'R@.5iou', 'F1@.5iou', 'mAP@.5', 'mAP@.5:.95') + s = ('%22s' + '%15s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', 'R', + 'mAP50', 'mAP50-95)') LOGGER.info(s) - pf = '%-16s' + '%12i' * 2 + '%12.3g' * 5 # print format - LOGGER.info(pf % ('all', seen, nt.sum(), mp, mr, f1.mean(0)[AP50_F1_max_idx], map50, map)) - - self.pr_metric_result = (map50, map) - - # Print results per class - if self.verbose and model.nc > 1: - for i, c in enumerate(ap_class): - LOGGER.info(pf % (model.names[c], seen, nt[c], p[i, AP50_F1_max_idx], r[i, AP50_F1_max_idx], - f1[i, AP50_F1_max_idx], ap50[i], ap[i])) + pf = '%22s' + '%15i' * 2 + '%11.5g' * 8 # print format + mr = metrics.mean_results() + LOGGER.info(pf % ('all', seen, nt.sum(), *mr)) + return [mr[2], mr[3], mr[6], mr[7]], [], [] if self.plot_confusion_matrix: confusion_matrix.plot(save_dir=self.save_dir, names=list(model.names)) else: - LOGGER.info("Calculate metric failed, might check dataset.") - self.pr_metric_result = (0.0, 0.0) + return [0, 0, 0, 0], [], [] + + return pred_results - return pred_results, vis_outputs, vis_paths + def parse_dynamic_params(self, flatten_kernels, weight_nums, bias_nums, dyconv_channels): + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, weight_nums + bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * dyconv_channels, -1, 1, 1) + bias_splits[i] = 
bias_splits[i].reshape(n_inst * + dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def handle_proto_solo(self, proto_list, oconfs, imgshape, weight_sums=66, bias_sums=1, dyconv=66, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def handle_proto_coord(proto): + _ = proto.shape[-2:] + x = torch.arange(0, 1, step = 1 / _[1]).unsqueeze(0).unsqueeze(0).repeat(1, _[0], 1).to(proto.dtype).to(proto.device) + y = torch.arange(0, 1, step = 1 / _[0]).unsqueeze(0).T.unsqueeze(0).repeat(1, 1, _[1]).to(proto.dtype).to(proto.device) + return torch.cat([proto, x, y]).reshape(1, -1, *_) + + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). + + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0][0] + proto = handle_proto_coord(proto) + s = proto.shape[-2:] + num_inst = confs.shape[0] + proto = proto.reshape(1, -1, *proto.shape[-2:]) + weights, biases = self.parse_dynamic_params(confs, weight_nums=weight_sums, bias_nums=bias_sums, dyconv_channels=dyconv) + n_layers = len(weights) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + proto, weight, bias=bias, stride=1, padding=0, groups=1) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, *proto.shape[-2:]).unsqueeze(0) + seg = x.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + masks = masks.gt_(0.5) + return masks + def eval_model(self, pred_results, model, dataloader, task): @@ -282,7 +377,8 @@ def eval_model(self, pred_results, model, dataloader, task): label_count_dicts[nc_i]["images"].add(ann_i["image_id"]) label_count_dicts[nc_i]["anns"] += 1 - s = ('%-16s' + '%12s' * 7) % ('Class', 'Labeled_images', 'Labels', 'P@.5iou', 'R@.5iou', 'F1@.5iou', 'mAP@.5', 'mAP@.5:.95') + s = ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', 'R', + 'mAP50', 'mAP50-95)') LOGGER.info(s) #IOU , all p, all cats, all gt, maxdet 100 coco_p = cocoEval.eval['precision'] @@ -383,6 +479,51 @@ def convert_to_coco_format(self, outputs, imgs, paths, shapes, ids): pred_results.append(pred_data) return pred_results + def convert_to_coco_format_seg(self, outputs, imgs, paths, shapes, ids, masks): + + from pycocotools.mask import encode + import time + + def single_encode(x): + rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0] + rle['counts'] = rle['counts'].decode('utf-8') + return rle + + + pred_results = [] + for i, pred in enumerate(outputs): + if len(pred) == 0: + continue + pred_masks = masks[i].cpu().numpy() + 
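convert_to_coco_format_seg encodes each predicted binary mask to COCO RLE with pycocotools, parallelized over a thread pool before being attached to the detection records. A minimal standalone sketch (random masks and a 160x160 size are assumptions for the example):

```python
# Sketch of the per-mask RLE encoding used for COCO-style submission (mask size is assumed).
import numpy as np
from multiprocessing.pool import ThreadPool
from pycocotools.mask import encode

def single_encode(x):
    # pycocotools expects a Fortran-ordered uint8 HxWx1 array; encode returns a list of RLE dicts
    rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0]
    rle['counts'] = rle['counts'].decode('utf-8')   # make the RLE JSON-serializable
    return rle

pred_masks = np.random.rand(8, 160, 160) > 0.5      # 8 binary masks (illustrative)
with ThreadPool(4) as pool:
    rles = pool.map(single_encode, list(pred_masks))
print(len(rles), sorted(rles[0].keys()))             # 8 ['counts', 'size']
```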
pred_masks = np.transpose(pred_masks, (2, 0, 1)) + a = time.time() + with ThreadPool(64) as pool: + rles = pool.map(single_encode, pred_masks) + print("rle time") + b = time.time() + path, shape = Path(paths[i]), shapes[i][0] + self.scale_coords(imgs[i].shape[1:], pred[:, :4], shape, shapes[i][1]) + image_id = int(path.stem) if self.is_coco else path.stem + bboxes = self.box_convert(pred[:, 0:4]) + bboxes[:, :2] -= bboxes[:, 2:] / 2 + cls = pred[:, 5] + scores = pred[:, 4] + for ind in range(pred.shape[0]): + category_id = ids[int(cls[ind])] + bbox = [round(x, 3) for x in bboxes[ind].tolist()] + score = round(scores[ind].item(), 5) + pred_data = { + "image_id": image_id, + "category_id": category_id, + "bbox": bbox, + "score": score, + 'segmentation': rles[i] + } + pred_results.append(pred_data) + c = time.time() + print(b-a, c-b) + return pred_results + @staticmethod def check_task(task): if task not in ['train', 'val', 'test', 'speed']: @@ -543,3 +684,48 @@ def convert_to_coco_format_trt(nums, boxes, scores, classes, paths, shapes, ids) pred_results.extend(convert_to_coco_format_trt(nums, boxes, scores, classes, paths, shapes, self.ids)) self.speed_result[0] += self.batch_size return dataloader, pred_results + + + + @staticmethod + def handle_proto_test(proto_list, oconfs, imgshape, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + + + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). + + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0] + + s = proto.shape[-2:] + seg = ((confs@proto.reshape(proto.shape[0], proto.shape[1], -1)).reshape(proto.shape[0], confs.shape[0], *s)) + seg = seg.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + return masks diff --git a/yolov6/core/inferer.py b/yolov6/core/inferer.py index cea6586d..3fef6b35 100644 --- a/yolov6/core/inferer.py +++ b/yolov6/core/inferer.py @@ -13,11 +13,13 @@ from PIL import ImageFont from collections import deque +import torch.nn.functional as F + from yolov6.utils.events import LOGGER, load_yaml from yolov6.layers.common import DetectBackend from yolov6.data.data_augment import letterbox from yolov6.data.datasets import LoadData -from yolov6.utils.nms import non_max_suppression +from yolov6.utils.nms import non_max_suppression_seg, non_max_suppression_seg_solo from yolov6.utils.torch_utils import get_model_info class Inferer: @@ -67,10 +69,13 @@ def model_switch(self, model, img_size): LOGGER.info("Switch model to deploy modality.") - def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, save_img, hide_labels, hide_conf, view_img=True): + def infer(self, conf_thres, 
iou_thres, classes, agnostic_nms, max_det, save_dir, save_txt, save_img, hide_labels, hide_conf, view_img=True, issolo=True, weight_nums=66, bias_nums=1, dyconv_channels=66): ''' Model Inference and results visualization ''' vid_path, vid_writer, windows = None, None, [] + print(issolo) fps_calculator = CalcFPS() + weight_nums = [weight_nums] + bias_nums = [bias_nums] for img_src, img_path, vid_cap in tqdm(self.files): img, img_src = self.process_image(img_src, self.img_size, self.stride, self.half) img = img.to(self.device) @@ -79,15 +84,31 @@ def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, # expand for batch dim t1 = time.time() pred_results = self.model(img) - det = non_max_suppression(pred_results, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)[0] + if not issolo: + loutputs = non_max_suppression_seg(pred_results, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det) + else: + loutputs = non_max_suppression_seg_solo(pred_results, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det) + protos = pred_results[1][0] + segments = [] + print(len(loutputs)) + segconf = [loutputs[li][..., 0:] for li in range(len(loutputs))] + det = [loutputs[li][..., :6] for li in range(len(loutputs))][0] + if not issolo: + segments = [self.handle_proto_test([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], img.shape[-2:]) for li in range(len(loutputs))][0] + else: + segments = [self.handle_proto_solo([protos[li].reshape(1, *(protos[li].shape[-3:]))], segconf[li], img.shape[-2:], weight_sums=weight_nums, bias_sums=bias_nums, dyconv=dyconv_channels) for li in range(len(loutputs))][0] t2 = time.time() + + if self.webcam: save_path = osp.join(save_dir, self.webcam_addr) txt_path = osp.join(save_dir, self.webcam_addr) else: # Create output files in nested dirs that mirrors the structure of the images' dirs - rel_path = osp.relpath(osp.dirname(img_path), osp.dirname(self.source)) + print(osp.dirname(img_path)) + print(osp.dirname(self.source)) + rel_path = "test" save_path = osp.join(save_dir, rel_path, osp.basename(img_path)) # im.jpg txt_path = osp.join(save_dir, rel_path, 'labels', osp.splitext(osp.basename(img_path))[0]) os.makedirs(osp.join(save_dir, rel_path), exist_ok=True) @@ -98,9 +119,14 @@ def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, # check image and font assert img_ori.data.contiguous, 'Image needs to be contiguous. Please apply to input images with np.ascontiguousarray(im).' 
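The handle_proto_test path called above assembles instance masks YOLACT-style: each detection's mask coefficients are linearly combined with the shared prototype maps, passed through a sigmoid, upsampled to the network input size, and finally cropped to the predicted box. A small sketch of the core combination (sizes are assumptions; the crop_mask step is omitted here):

```python
# Sketch of prototype/coefficient mask assembly (YOLACT-style); sizes are illustrative only.
import torch
import torch.nn.functional as F

n_det, n_proto, ph, pw = 3, 32, 40, 40
protos = torch.rand(1, n_proto, ph, pw)          # shared prototype maps for one image
coeffs = torch.rand(n_det, n_proto)              # one coefficient vector per detection

# linear combination over the prototype dimension, then sigmoid
seg = (coeffs @ protos.reshape(1, n_proto, -1)).reshape(1, n_det, ph, pw).sigmoid()

# upsample to the letterboxed input resolution and binarize
masks = F.interpolate(seg, (320, 320), mode='bilinear', align_corners=False)[0]
masks = masks.gt(0.5)
print(masks.shape)                               # torch.Size([3, 320, 320])
```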
self.font_check() - if len(det): det[:, :4] = self.rescale(img.shape[2:], det[:, :4], img_src.shape).round() + + + ii = 0 + segments = self.rescale_mask(img.shape[2:], segments.cpu().numpy(), img_src.shape) + print(segments.shape) + segments = segments.transpose(2, 0, 1) for *xyxy, conf, cls in reversed(det): if save_txt: # Write to file xywh = (self.box_convert(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh @@ -109,13 +135,16 @@ def infer(self, conf_thres, iou_thres, classes, agnostic_nms, max_det, save_dir, f.write(('%g ' * len(line)).rstrip() % line + '\n') if save_img: + print(cls) class_num = int(cls) # integer class label = None if hide_labels else (self.class_names[class_num] if hide_conf else f'{self.class_names[class_num]} {conf:.2f}') - self.plot_box_and_label(img_ori, max(round(sum(img_ori.shape) / 2 * 0.003), 2), xyxy, label, color=self.generate_colors(class_num, True)) + img_ori = self.plot_box_and_label(img_ori, max(round(sum(img_ori.shape) / 2 * 0.003), 2), xyxy, label, color=self.generate_colors(class_num, True), segment=segments[ii]) + ii += 1 img_src = np.asarray(img_ori) + # FPS counter fps_calculator.update(1.0 / (t2 - t1)) avg_fps = fps_calculator.accumulate() @@ -187,6 +216,21 @@ def rescale(ori_shape, boxes, target_shape): return boxes + @staticmethod + def rescale_mask(ori_shape, masks, target_shape): + '''Rescale the output to the original image shape''' + ratio = min(ori_shape[0] / target_shape[0], ori_shape[1] / target_shape[1]) + padding = int((ori_shape[1] - target_shape[1] * ratio) / 2), int((ori_shape[0] - target_shape[0] * ratio) / 2) + + + masks = masks[:, padding[1]: ori_shape[0]- padding[1], padding[0]: ori_shape[1] - padding[0]] + masks = masks.transpose(1, 2, 0) + masks = cv2.resize(masks, target_shape[:2][::-1]) + if len(masks.shape) == 2: + masks = masks.reshape(*masks.shape, 1) + + return masks + def check_img_size(self, img_size, s=32, floor=0): """Make sure image size is a multiple of stride s in each dimension, and return a new shape list of image.""" if isinstance(img_size, int): # integer i.e. img_size=640 @@ -204,6 +248,200 @@ def make_divisible(self, x, divisor): # Upward revision the value x to make it evenly divisible by the divisor. return math.ceil(x / divisor) * divisor + @staticmethod + def handle_proto(proto_list, oconfs, imgshape, det): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + + xyxy = oconfs[..., :4] + which_proto = conf[..., 0] + confs = conf[..., 1:] + res = [] + protos = proto_list[0] + for i, proto in enumerate([protos, protos, protos]): + s = proto.shape[-2:] + tconfs = confs[which_proto[..., 0] == i] + if tconfs.shape[0] == 0: + continue + tseg = ((tconfs@proto.reshape(proto.shape[0], proto.shape[1], -1)).reshape(proto.shape[0], tconfs.shape[1], *s)) + print("a:") + print(which_proto[..., 0] == i) + tseg=tseg.sigmoid() + masks = F.interpolate(tseg, imgshape, mode='nearest')[0] + #return masks + print(xyxy[which_proto[..., 0] == i][0].shape) + masks = crop_mask(masks, xyxy[which_proto[..., 0] == i][0])[0] + res.append(masks.gt_(0.5)) + return torch.cat(res, dim = 0), xyxy[which_proto[..., 0] == i][0] + + + @staticmethod + def handle_proto_test(proto_list, oconfs, imgshape, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). + + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0] + s = proto.shape[-2:] + seg = ((confs@proto.reshape(proto.shape[0], proto.shape[1], -1)).reshape(proto.shape[0], confs.shape[0], *s)) + seg = seg.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + return masks + + # def handle_proto_solo(self, proto_list, oconfs, imgshape, weight_sums=66, bias_sums=66, dyconv=66, img_orishape=None): + # ''' + # proto_list: [(bs, 32, w, h), ...] + # conf: (bs, l, 33) -> which_proto, 32 + # ''' + # def crop_mask(masks, boxes): + # """ + # "Crop" predicted masks by zeroing out everything not in the predicted bbox. + # Vectorized by Chong (thanks Chong). 
+ + # Args: + # - masks should be a size [n, h, w] tensor of masks + # - boxes should be a size [n, 4] tensor of bbox coords in relative point form + # """ + + # n, h, w = masks.shape + # x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + # r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + # c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + # return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + # conf = oconfs[..., 6:] + # if conf.shape[0] == 0: + # return None + + # xyxy = oconfs[..., :4] + # confs = conf[..., 1:] + # proto = proto_list[0] + # s = proto.shape[-2:] + # num_inst = confs.shape[0] + # proto = proto.reshape(1, -1, *proto.shape[-2:]) + # proto = proto.repeat(num_inst, 1, 1, 1) + # weights, biases = self.parse_dynamic_params(confs, weight_nums=weight_sums, bias_nums=bias_sums, dyconv_channels=dyconv) + # n_layers = len(weights) + # for i, (weight, bias) in enumerate(zip(weights, biases)): + # x = F.conv2d( + # proto, weight, bias=bias, stride=1, padding=0, groups=num_inst) + # if i < n_layers - 1: + # x = F.relu(x) + # x = x.reshape(num_inst, *proto.shape[-2:]) + # seg = x.sigmoid() + # masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + # if img_orishape: + # masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + # else: + # masks_ori = None + # masks = crop_mask(masks, xyxy).gt_(0.5) + # return masks + def handle_proto_solo(self, proto_list, oconfs, imgshape, weight_sums=66, bias_sums=1, dyconv=66, img_orishape=None): + ''' + proto_list: [(bs, 32, w, h), ...] + conf: (bs, l, 33) -> which_proto, 32 + ''' + def handle_proto_coord(proto): + _ = proto.shape[-2:] + x = torch.arange(0, 1, step = 1 / _[1]).unsqueeze(0).unsqueeze(0).repeat(1, _[0], 1).to(proto.dtype).to(proto.device) + y = torch.arange(0, 1, step = 1 / _[0]).unsqueeze(0).T.unsqueeze(0).repeat(1, 1, _[1]).to(proto.dtype).to(proto.device) + return torch.cat([proto, x, y]).reshape(1, -1, *_) + + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + conf = oconfs[..., 6:] + if conf.shape[0] == 0: + return None + + xyxy = oconfs[..., :4] + confs = conf[..., 1:] + proto = proto_list[0][0] + proto = handle_proto_coord(proto) + s = proto.shape[-2:] + num_inst = confs.shape[0] + proto = proto.reshape(1, -1, *proto.shape[-2:]) + weights, biases = self.parse_dynamic_params(confs, weight_nums=weight_sums, bias_nums=bias_sums, dyconv_channels=dyconv) + n_layers = len(weights) + for i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + proto, weight, bias=bias, stride=1, padding=0, groups=1) + if i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, *proto.shape[-2:]).unsqueeze(0) + seg = x.sigmoid() + masks = F.interpolate(seg, imgshape, mode='bilinear', align_corners=False)[0] + if img_orishape: + masks_ori = F.interpolate(seg, img_orishape, mode='nearest')[0] + else: + masks_ori = None + masks = crop_mask(masks, xyxy).gt_(0.5) + masks = masks.gt_(0.5) + return masks + + + + + @staticmethod def draw_text( img, @@ -237,9 +475,10 @@ def draw_text( return text_size @staticmethod - def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255), font=cv2.FONT_HERSHEY_COMPLEX): + def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255), font=cv2.FONT_HERSHEY_COMPLEX, segment=None): # Add one xyxy box to image with label p1, p2 = (int(box[0]), int(box[1])), (int(box[2]), int(box[3])) + common_color = [[128,0,0], [255,0,0],[255,0,255],[255,102,0],[51,51,0],[0,51,0],[51,204,204],[0,128,128],[0,204,255]] cv2.rectangle(image, p1, p2, color, thickness=lw, lineType=cv2.LINE_AA) if label: tf = max(lw - 1, 1) # font thickness @@ -249,6 +488,13 @@ def plot_box_and_label(image, lw, box, label='', color=(128, 128, 128), txt_colo cv2.rectangle(image, p1, p2, color, -1, cv2.LINE_AA) # filled cv2.putText(image, label, (p1[0], p1[1] - 2 if outside else p1[1] + h + 2), font, lw / 3, txt_color, thickness=tf, lineType=cv2.LINE_AA) + if segment is not None: + import random + ii=random.randint(0, len(common_color)-1) + colr = np.asarray(common_color[ii]) + colr = colr.reshape(1,3).repeat((image.shape[0] * image.shape[1]), axis = 0).reshape(image.shape[0], image.shape[1], 3) + image = cv2.addWeighted(image, 1, (colr * segment.reshape(*segment.shape[:2], 1)).astype(image.dtype), 0.8, 1) + return image @staticmethod def font_check(font='./yolov6/utils/Arial.ttf', size=10): @@ -280,6 +526,27 @@ def generate_colors(i, bgr=False): num = len(palette) color = palette[int(i) % num] return (color[2], color[1], color[0]) if bgr else color + + def parse_dynamic_params(self, flatten_kernels, weight_nums, bias_nums, dyconv_channels): + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, weight_nums + bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: 
+ weight_splits[i] = weight_splits[i].reshape( + n_inst * dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst * + dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits class CalcFPS: def __init__(self, nsamples: int = 50): diff --git a/yolov6/data/data_augment.py b/yolov6/data/data_augment.py index 45df88e6..e21c3873 100644 --- a/yolov6/data/data_augment.py +++ b/yolov6/data/data_augment.py @@ -26,7 +26,7 @@ def augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5): cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=im) # no return needed -def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32): +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=False, scaleup=True, stride=32): '''Resize and pad image while meeting stride-multiple constraints.''' shape = im.shape[:2] # current shape [height, width] if isinstance(new_shape, int): @@ -51,19 +51,22 @@ def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleu if shape[::-1] != new_unpad: # resize im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return im, r, (left, top) -def mixup(im, labels, im2, labels2): +def mixup(im, labels, segments, im2, labels2, segments2): '''Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf.''' r = np.random.beta(32.0, 32.0) # mixup ratio, alpha=beta=32.0 im = (im * r + im2 * (1 - r)).astype(np.uint8) labels = np.concatenate((labels, labels2), 0) - return im, labels + segments = np.concatenate((segments, segments2), 0) + return im, labels, segments def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1, eps=1e-16): # box1(4,n), box2(4,n) @@ -78,19 +81,17 @@ def random_affine(img, labels=(), degrees=10, translate=.1, scale=.1, shear=10, new_shape=(640, 640)): '''Applies Random affine transformation.''' n = len(labels) - if isinstance(new_shape, int): - height = width = new_shape - else: - height, width = new_shape + height, width = new_shape M, s = get_transform_matrix(img.shape[:2], (height, width), degrees, scale, shear, translate) if (M != np.eye(3)).any(): # image changed img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) # Transform label coordinates + new_segments = [] if n: new = np.zeros((n, 4)) - + xy = np.ones((n * 4, 3)) xy[:, :2] = labels[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 xy = xy @ M.T # transform @@ -113,6 +114,7 @@ def random_affine(img, labels=(), degrees=10, translate=.1, scale=.1, shear=10, return img, labels + def get_transform_matrix(img_shape, new_shape, degrees, scale, shear, translate): new_height, new_width = new_shape # Center @@ -147,6 +149,7 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False '''Applies Mosaic augmentation.''' assert len(imgs) == 4, "Mosaic augmentation of current version only supports 4 images." 
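The parse_dynamic_params helper added above splits each instance's flattened kernel prediction into convolution weights and biases, which handle_proto_solo then applies to the prototype features as a per-instance dynamic convolution (CondInst-style). A single-layer sketch with sizes matching the weight_nums=66 / bias_nums=1 defaults (other shapes are assumptions):

```python
# Sketch of the dynamic-conv mask head used by the SOLO-style path (channel sizes assumed).
import torch
import torch.nn.functional as F

n_inst, c_feat, h, w = 4, 66, 40, 40
proto = torch.rand(1, c_feat, h, w)               # prototype features (incl. coord channels)
flat_kernels = torch.rand(n_inst, c_feat + 1)     # per instance: 66 weights + 1 bias

# split each flattened prediction into conv weight and bias (single dynamic layer)
weight, bias = torch.split_with_sizes(flat_kernels, [c_feat, 1], dim=1)
weight = weight.reshape(n_inst, c_feat, 1, 1)     # one 1x1 kernel per instance
bias = bias.reshape(n_inst)

masks = F.conv2d(proto, weight, bias=bias)        # (1, n_inst, h, w): one mask per instance
masks = masks.sigmoid().gt(0.5)[0]
print(masks.shape)                                # torch.Size([4, 40, 40])
```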
labels4 = [] + if not specific_shape: if isinstance(shape, list) or isinstance(shape, np.ndarray): target_height, target_width = shape @@ -180,15 +183,18 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False # Labels labels_per_img = labels[i].copy() + if labels_per_img.size: boxes = np.copy(labels_per_img[:, 1:]) boxes[:, 0] = w * (labels_per_img[:, 1] - labels_per_img[:, 3] / 2) + padw # top left x boxes[:, 1] = h * (labels_per_img[:, 2] - labels_per_img[:, 4] / 2) + padh # top left y boxes[:, 2] = w * (labels_per_img[:, 1] + labels_per_img[:, 3] / 2) + padw # bottom right x boxes[:, 3] = h * (labels_per_img[:, 2] + labels_per_img[:, 4] / 2) + padh # bottom right y + labels_per_img[:, 1:] = boxes labels4.append(labels_per_img) + # Concat/clip labels labels4 = np.concatenate(labels4, 0) @@ -196,6 +202,7 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False # np.clip(x, 0, 2 * s, out=x) labels4[:, 1::2] = np.clip(labels4[:, 1::2], 0, 2 * target_width) labels4[:, 2::2] = np.clip(labels4[:, 2::2], 0, 2 * target_height) + # Augment img4, labels4 = random_affine(img4, labels4, @@ -205,4 +212,4 @@ def mosaic_augmentation(shape, imgs, hs, ws, labels, hyp, specific_shape = False shear=hyp['shear'], new_shape=(target_height, target_width)) - return img4, labels4 + return img4, labels4 \ No newline at end of file diff --git a/yolov6/data/data_load.py b/yolov6/data/data_load.py index e68e8d71..923ab1f2 100644 --- a/yolov6/data/data_load.py +++ b/yolov6/data/data_load.py @@ -7,7 +7,7 @@ import torch.distributed as dist from torch.utils.data import dataloader, distributed -from .datasets import TrainValDataset +from .seg_datasets import TrainValDataset from yolov6.utils.events import LOGGER from yolov6.utils.torch_utils import torch_distributed_zero_first diff --git a/yolov6/data/seg_data_augment.py b/yolov6/data/seg_data_augment.py new file mode 100644 index 00000000..6a2c87b6 --- /dev/null +++ b/yolov6/data/seg_data_augment.py @@ -0,0 +1,298 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This code is based on +# https://github.com/ultralytics/yolov5/blob/master/utils/dataloaders.py + +import math +import random + +import cv2 +import numpy as np + + +def augment_hsv(im, hgain=0.5, sgain=0.5, vgain=0.5): + '''HSV color-space augmentation.''' + if hgain or sgain or vgain: + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(im, cv2.COLOR_BGR2HSV)) + dtype = im.dtype # uint8 + + x = np.arange(0, 256, dtype=r.dtype) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + im_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))) + cv2.cvtColor(im_hsv, cv2.COLOR_HSV2BGR, dst=im) # no return needed + + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32): + '''Resize and pad image while meeting stride-multiple constraints.''' + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + elif isinstance(new_shape, list) and len(new_shape) == 1: + new_shape = (new_shape[0], new_shape[0]) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + new_unpad = int(round(shape[1] * r)), 
int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + + return im, r, (left, top) + + +def mixup(im, labels, segments, im2, labels2, segments2): + # Applies MixUp augmentation https://arxiv.org/pdf/1710.09412.pdf + r = np.random.beta(32.0, 32.0) # mixup ratio, alpha=beta=32.0 + im = (im * r + im2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + segments = np.concatenate((segments, segments2), 0) + return im, labels, segments + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1, eps=1e-16): # box1(4,n), box2(4,n) + '''Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio.''' + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (ar < ar_thr) # candidates + + +def random_affine(img, labels=(), segments=(), degrees=10, translate=.1, scale=.1, shear=10, + new_shape=(640, 640), task=""): + '''Applies Random affine transformation.''' + n = len(labels) + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + height, width = new_shape + # print(height, width, (height, width)) + + M, s = get_transform_matrix(img.shape[:2], (height, width), degrees, scale, shear, translate) + if (M != np.eye(3)).any(): # image changed + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + new_segments = [] + # Transform label coordinates + if n: + new = np.zeros((n, 4)) + segments = resample_segments(segments) + for i, segment in enumerate(segments): + xy = np.ones((len(segment), 3)) + xy[:, :2] = segment + xy = xy @ M.T # transform + xy = (xy[:, :2]) + + # clip + new[i] = segment2box(xy, width, height) + new_segments.append(xy) + i = box_candidates(box1=labels[:, 1:5].T * s, box2=new.T, area_thr=0.01) + if task!="val": + labels = labels[i] + labels[:, 1:5] = new[i] + new_segments = np.array(new_segments)[i] + else: + labels[:, 1:5] = new + new_segments = np.array(new_segments) + return img, labels, new_segments + +def copy_paste(im, labels, segments, p=0.5): + # Implement Copy-Paste augmentation https://arxiv.org/abs/2012.07177, labels as nx5 np.array(cls, xyxy) + n = len(segments) + if p and n: + h, w, c = im.shape # height, width, channels + im_new = np.zeros(im.shape, np.uint8) + for j in random.sample(range(n), k=round(p * n)): + l, s = labels[j], segments[j] + box = w - l[3], l[2], w - l[1], l[4] + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + if (ioa < 0.30).all(): # allow 30% obscuration of existing labels + labels = np.concatenate((labels, [[l[0], *box]]), 0) + segments.append(np.concatenate((w - s[:, 0:1], s[:, 1:2]), 1)) + cv2.drawContours(im_new, [segments[j].astype(np.int32)], -1, (1, 1, 1), cv2.FILLED) + result = cv2.flip(im, 1) # augment segments (flip left-right) + i = cv2.flip(im_new, 1).astype(bool) + im[i] = result[i] # cv2.imwrite('debug.jpg', im) # debug + + 
return im, labels, segments + +def bbox_ioa(box1, box2, eps=1e-7): + """ Returns the intersection over box2 area given box1, box2. Boxes are x1y1x2y2 + box1: np.array of shape(4) + box2: np.array of shape(nx4) + returns: np.array of shape(n) + """ + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1 + b2_x1, b2_y1, b2_x2, b2_y2 = box2.T + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + eps + + # Intersection over box2 area + return inter_area / box2_area + + +def regen_labels(labels=None, segments=None, new_shape=(640, 640)): + '''Applies Random affine transformation.''' + n = len(segments) + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + height, width = new_shape + + new_segments = [] + # Transform label coordinates + if n: + new = np.zeros((n, 4)) + segments = resample_segments(segments) + for i, segment in enumerate(segments): + new[i] = segment2box(segment, width, height) + new_segments.append(segment) + labels[:, 1:5] = new[i] + new_segments = np.array(new_segments)[i] + + return labels, new_segments + +def resample_segments(segments, n=1000): + # Up-sample an (n,2) segment + for i, s in enumerate(segments): + s = np.concatenate((s, s[0:1, :]), axis=0) + x = np.linspace(0, len(s) - 1, n) + xp = np.arange(len(s)) + segments[i] = np.concatenate([np.interp(x, xp, s[:, i]) for i in range(2)]).reshape(2, -1).T # segment xy + return segments + + +def get_transform_matrix(img_shape, new_shape, degrees, scale, shear, translate): + new_height, new_width = new_shape + # print(new_height, new_width) + # Center + C = np.eye(3) + C[0, 2] = -img_shape[1] / 2 # x translation (pixels) + C[1, 2] = -img_shape[0] / 2 # y translation (pixels) + + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * new_width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * new_height # y transla ion (pixels) + + # Combined rotation matrix + M = T @ S @ R @ C # order of operations (right to left) is IMPORTANT + return M, s + + +def mosaic_augmentation(shape, imgs, hs, ws, labels, segments, hyp, specific_shape = False, target_height=640, target_width=640): + '''Applies Mosaic augmentation.''' + assert len(imgs) == 4, "Mosaic augmentation of current version only supports 4 images." 
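+    # Editor's note (not part of the original patch): the mosaic canvas is
+    # (2*target_height, 2*target_width) filled with gray value 114, and a random
+    # centre (yc, xc) is drawn from [target//2, 3*target//2]; e.g. for a 640x640
+    # target the canvas is 1280x1280 and the centre lies in [320, 960] on each axis.
+    # Each tile's normalized (cx, cy, w, h) labels are mapped to pixel (x1, y1, x2, y2)
+    # on the canvas by adding the placement offsets padw/padh, and segment points get
+    # the same scale-and-shift before everything is clipped to the canvas. Note the
+    # early `return` further down: the random_affine step after it is dead code here,
+    # because the caller (get_mosaic) applies copy_paste and random_affine itself.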
+ labels4 = [] + segments4 = [] + if not specific_shape: + if isinstance(shape, list) or isinstance(shape, np.ndarray): + target_height, target_width = shape + else: + target_height = target_width = shape + + yc, xc = (int(random.uniform(x//2, 3*x//2)) for x in (target_height, target_width) ) # mosaic center x, y + + for i in range(len(imgs)): + # Load image + img, h, w = imgs[i], hs[i], ws[i] + # place img in img4 + if i == 0: # top left + img4 = np.full((target_height * 2, target_width * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, target_width * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(target_height * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, target_width * 2), min(target_height * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + labels_per_img = labels[i].copy() + segments_per_img = segments[i].copy() + if labels_per_img.size: + boxes = np.copy(labels_per_img[:, 1:]) + boxes[:, 0] = w * (labels_per_img[:, 1] - labels_per_img[:, 3] / 2) + padw # top left x + boxes[:, 1] = h * (labels_per_img[:, 2] - labels_per_img[:, 4] / 2) + padh # top left y + boxes[:, 2] = w * (labels_per_img[:, 1] + labels_per_img[:, 3] / 2) + padw # bottom right x + boxes[:, 3] = h * (labels_per_img[:, 2] + labels_per_img[:, 4] / 2) + padh # bottom right y + for __ in range(len(segments_per_img)): + segments_per_img[__][:, 0] = w * segments_per_img[__][:, 0] + padw + segments_per_img[__][:, 1] = h * segments_per_img[__][:, 1] + padh + labels_per_img[:, 1:] = boxes + + labels4.append(labels_per_img) + segments4.extend(segments_per_img) + + # Concat/clip labels + labels4 = np.concatenate(labels4, 0) + # for x in (labels4[:, 1:]): + # np.clip(x, 0, 2 * s, out=x) + labels4[:, 1::2] = np.clip(labels4[:, 1::2], 0, 2 * target_width) + labels4[:, 2::2] = np.clip(labels4[:, 2::2], 0, 2 * target_height) + for __ in range(len(segments4)): + segments4[__][:, 0] = np.clip(segments4[__][:, 0], 0, 2 * target_width) + segments4[__][:, 1] = np.clip(segments4[__][:, 1], 0, 2 * target_height) + + # Augment + return img4, labels4, segments4 + img4, labels4, segments4 = random_affine(img4, labels4, segments4, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + shear=hyp['shear'], + new_shape=(target_height, target_width)) + + return img4, labels4, segments4 + +def segment2box(segment, width=640, height=640): + # Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) 
to (xyxy) + x, y = segment.T # segment xy + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y, = x[inside], y[inside] + return np.array([x.min(), y.min(), x.max(), y.max()]) if any(x) else np.zeros((1, 4)) # xyxy + diff --git a/yolov6/data/seg_datasets.py b/yolov6/data/seg_datasets.py new file mode 100644 index 00000000..8cca6513 --- /dev/null +++ b/yolov6/data/seg_datasets.py @@ -0,0 +1,859 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import glob +from io import UnsupportedOperation +import os +import os.path as osp +import random +import json +import time +import hashlib +from pathlib import Path +import copy + +from multiprocessing.pool import Pool + +import cv2 +import numpy as np +from tqdm import tqdm +from PIL import ExifTags, Image, ImageOps + +import torch +from torch.utils.data import Dataset +import torch.distributed as dist + +from .seg_data_augment import ( + augment_hsv, + letterbox, + mixup, + random_affine, + mosaic_augmentation, + copy_paste +) +from yolov6.utils.events import LOGGER +import pickle + + +# Parameters +IMG_FORMATS = ["bmp", "jpg", "jpeg", "png", "tif", "tiff", "dng", "webp", "mpo"] +VID_FORMATS = ["mp4", "mov", "avi", "mkv"] +IMG_FORMATS.extend([f.upper() for f in IMG_FORMATS]) +VID_FORMATS.extend([f.upper() for f in VID_FORMATS]) +# Get orientation exif tag +for k, v in ExifTags.TAGS.items(): + if v == "Orientation": + ORIENTATION = k + break + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = f'{os.sep}images{os.sep}', f'{os.sep}labels{os.sep}' # /images/, /labels/ substrings + return [sb.join(x.rsplit(sa, 1)).rsplit('.', 1)[0] + '.txt' for x in img_paths] + +class TrainValDataset(Dataset): + '''YOLOv6 train_loader/val_loader, loads images and labels for training and validation.''' + def __init__( + self, + img_dir, + img_size=640, + batch_size=16, + augment=False, + hyp=None, + rect=False, + check_images=False, + check_labels=False, + stride=32, + pad=0.0, + rank=-1, + data_dict=None, + task="train", + specific_shape = False, + height=1088, + width=1920, + downsample_ratio=4, + overlap=False + ): + assert task.lower() in ("train", "val", "test", "speed"), f"Not supported task: {task}" + + t1 = time.time() + self.__dict__.update(locals()) + if task.lower()!="train": + self.downsample_ratio = 1 + self.main_process = self.rank in (-1, 0) + self.task = self.task.capitalize() + self.class_names = data_dict["names"] + self.img_paths, self.labels = self.get_imgs_labels(self.img_dir) + self.labels, self.segments = self.get_segment(self.labels) + + self.rect = rect + self.specific_shape = specific_shape + self.target_height = height + self.target_width = width + if self.rect: + shapes = [self.img_info[p]["shape"] for p in self.img_paths] + self.shapes = np.array(shapes, dtype=np.float64) + if dist.is_initialized(): + # in DDP mode, we need to make sure all images within batch_size * gpu_num + # will resized and padded to same shape. + sample_batch_size = self.batch_size * dist.get_world_size() + else: + sample_batch_size = self.batch_size + self.batch_indices = np.floor( + np.arange(len(shapes)) / sample_batch_size + ).astype( + np.int_ + ) # batch indices of each image + + self.sort_files_shapes() + + t2 = time.time() + if self.main_process: + LOGGER.info(f"%.1fs for dataset initialization." % (t2 - t1)) + + def __len__(self): + """Get the length of dataset""" + return len(self.img_paths) + + def __getitem__(self, index): + """Fetching a data sample for a given key. 
+ This function applies mosaic and mixup augments during training. + During validation, letterbox augment is applied. + """ + target_shape = ( + (self.target_height, self.target_width) if self.specific_shape else + self.batch_shapes[self.batch_indices[index]] if self.rect + else self.img_size + ) + + # Mosaic Augmentation + if self.augment and random.random() < self.hyp["mosaic"]: + img, labels, segments = self.get_mosaic(index, target_shape) + shapes = None + + + # MixUp augmentation + if random.random() < self.hyp["mixup"]: + img_other, labels_other, segments_other = self.get_mosaic( + random.randint(0, len(self.img_paths) - 1), target_shape + ) + img, labels, segments = mixup(img, labels, segments, img_other, labels_other, segments_other) # To Change + + else: + # Load image + if self.hyp and "shrink_size" in self.hyp: + img, (h0, w0), (h, w) = self.load_image(index, self.hyp["shrink_size"]) + else: + img, (h0, w0), (h, w) = self.load_image(index) + + # letterbox + img, ratio, pad = letterbox(img, target_shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h * ratio / h0, w * ratio / w0), pad) # for COCO mAP rescaling + labels = copy.deepcopy(self.labels[index]) + segments = copy.deepcopy(self.segments[index]) + + if labels.size: + w *= ratio + h *= ratio + # new boxes + boxes = np.copy(labels[:, 1:5]) + boxes[:, 0] = ( + w * (labels[:, 1] - labels[:, 3] / 2) + pad[0] + ) # top left x + boxes[:, 1] = ( + h * (labels[:, 2] - labels[:, 4] / 2) + pad[1] + ) # top left y + boxes[:, 2] = ( + w * (labels[:, 1] + labels[:, 3] / 2) + pad[0] + ) # bottom right x + boxes[:, 3] = ( + h * (labels[:, 2] + labels[:, 4] / 2) + pad[1] + ) # bottom right y + labels[:, 1:] = boxes + + if len(segments): + for i_s in range(len(segments)): + segments[i_s][:, 0] = segments[i_s][:, 0] * ratio * w + pad[0] + segments[i_s][:, 1] = segments[i_s][:, 1] * ratio * h + pad[1] + + if self.augment: + img, labels, segments = random_affine( + img, + labels, + segments, + degrees=self.hyp["degrees"], + translate=self.hyp["translate"], + scale=self.hyp["scale"], + shear=self.hyp["shear"], + new_shape=target_shape, + ) + else: + img, labels, segments = random_affine( + img, + labels, + segments, + degrees=0, + translate=0, + scale=0, + shear=0, + new_shape=target_shape, + task="val" + ) + + + if len(labels): + h, w = img.shape[:2] + + labels[:, [1, 3]] = labels[:, [1, 3]].clip(0, w - 1e-3) # x1, x2 + labels[:, [2, 4]] = labels[:, [2, 4]].clip(0, h - 1e-3) # y1, y2 + + boxes = np.copy(labels[:, 1:]) + boxes[:, 0] = ((labels[:, 1] + labels[:, 3]) / 2) / w # x center + boxes[:, 1] = ((labels[:, 2] + labels[:, 4]) / 2) / h # y center + boxes[:, 2] = (labels[:, 3] - labels[:, 1]) / w # width + boxes[:, 3] = (labels[:, 4] - labels[:, 2]) / h # height + labels[:, 1:] = boxes + lindex = labels[:, 0] >= 0 + masks = self.polygons2masks(img.shape[:2], segments, color=1, downsample_ratio=self.downsample_ratio) + labels = labels[lindex] + masks = masks[lindex] + + else: + masks = np.asarray([]) + + if self.augment: + img, labels, masks = self.general_augment(img, labels, masks.transpose(1, 2, 0) if masks.shape[0]!=0 else masks) + + #? 
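+        # Editor's note (not part of the original patch): at this point `masks` is an
+        # (n, H // downsample_ratio, W // downsample_ratio) array with one binary mask
+        # per kept label. When there are no instances, the placeholder below keeps the
+        # shapes collate-friendly: a single zero map if self.overlap, else one zero
+        # mask per label, so collate_fn can torch.cat the masks across the batch.
+        # labels_out is (n, 6); column 0 is filled with the image index in collate_fn.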
+ + masks_out = (torch.from_numpy(masks.copy()) if len(masks) else torch.zeros(1 if self.overlap else len(labels), img.shape[0] // + self.downsample_ratio, img.shape[1] // + self.downsample_ratio)) + + labels_out = torch.zeros((len(labels), 6)) + if len(labels): + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + # self.drawit(img, labels, masks, self.img_paths[index], self.task) + img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB + img = np.ascontiguousarray(img) + return torch.from_numpy(img), labels_out, self.img_paths[index], shapes, masks_out + + def load_image(self, index, shrink_size=None): + """Load image. + This function loads image by cv2, resize original image to target shape(img_size) with keeping ratio. + + Returns: + Image, original shape of image, resized image shape + """ + path = self.img_paths[index] + try: + im = cv2.imread(path) + assert im is not None, f"opencv cannot read image correctly or {path} not exists" + except: + im = cv2.cvtColor(np.asarray(Image.open(path)), cv2.COLOR_RGB2BGR) + assert im is not None, f"Image Not Found {path}, workdir: {os.getcwd()}" + + h0, w0 = im.shape[:2] # origin shape + if self.specific_shape: + # keep ratio resize + ratio = min(self.target_width / w0, self.target_height / h0) + + elif shrink_size: + ratio = (self.img_size - shrink_size) / max(h0, w0) + + else: + ratio = self.img_size / max(h0, w0) + + if ratio != 1: + im = cv2.resize( + im, + (int(w0 * ratio), int(h0 * ratio)), + interpolation=cv2.INTER_AREA + if ratio < 1 and not self.augment + else cv2.INTER_LINEAR, + ) + return im, (h0, w0), im.shape[:2] + + @staticmethod + def collate_fn(batch): + """Merges a list of samples to form a mini-batch of Tensor(s)""" + img, label, path, shapes, masks = zip(*batch) + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes, torch.cat(masks, 0) + + @staticmethod + def get_segment(labels): + rlabels = [] + segments = [] + if len(labels) == 0: + return np.asarray([]) + for label in labels: + z1 = []#labels + z2 = []#seg + for l in label: + z1.append(np.asarray(l[:5]).reshape(1, 5).astype(np.float32)) + z2.append(np.asarray(l[1:]).reshape(-1, 2).astype(np.float32)) + if z1: + rlabels.append(np.concatenate(z1, axis = 0)) + segments.append(z2) + else: + t = np.zeros((1, 5), dtype = np.float32) + t[..., 0]= -1 + rlabels.append(t) + segments.append([np.zeros((2, 2), dtype = np.float32)]) + return rlabels, segments + + + + + def get_imgs_labels(self, img_dirs): + if not isinstance(img_dirs, list): + img_dirs = [img_dirs] + # we store the cache img file in the first directory of img_dirs + valid_img_record = osp.join( + osp.dirname(img_dirs[0]), "." + osp.basename(img_dirs[0]) + "_cache.json" + ) + NUM_THREADS = min(8, os.cpu_count()) + img_paths = [] + for img_dir in img_dirs: + assert osp.exists(img_dir), f"{img_dir} is an invalid directory path!" + img_paths += glob.glob(osp.join(img_dir, "**/*"), recursive=True) + + img_paths = sorted( + p for p in img_paths if p.split(".")[-1].lower() in IMG_FORMATS and os.path.isfile(p) + ) + + assert img_paths, f"No images found in {img_dir}." 
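+        # Editor's note (not part of the original patch): a hidden cache file
+        # ".<image_dir_name>_cache.json" next to the first image directory stores the
+        # per-image shapes/labels together with md5 hashes of the path lists; when the
+        # stored "image_hash"/"label_hash" still match, the per-file checks below are
+        # skipped. Illustrative layout (paths are hypothetical):
+        # {"information": {"/data/images/train/000001.jpg": {"shape": [480, 640],
+        #                                                    "labels": [...]}},
+        #  "image_hash": "<md5>", "label_hash": "<md5>"}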
+ img_hash = self.get_hash(img_paths) + LOGGER.info(f'img record infomation path is:{valid_img_record}') + if osp.exists(valid_img_record): + with open(valid_img_record, "r") as f: + cache_info = json.load(f) + if "image_hash" in cache_info and cache_info["image_hash"] == img_hash: + img_info = cache_info["information"] + else: + self.check_images = True + else: + self.check_images = True + + # check images + if self.check_images and self.main_process: + img_info = {} + nc, msgs = 0, [] # number corrupt, messages + LOGGER.info( + f"{self.task}: Checking formats of images with {NUM_THREADS} process(es): " + ) + with Pool(NUM_THREADS) as pool: + pbar = tqdm( + pool.imap(TrainValDataset.check_image, img_paths), + total=len(img_paths), + ) + for img_path, shape_per_img, nc_per_img, msg in pbar: + if nc_per_img == 0: # not corrupted + img_info[img_path] = {"shape": shape_per_img} + nc += nc_per_img + if msg: + msgs.append(msg) + pbar.desc = f"{nc} image(s) corrupted" + pbar.close() + if msgs: + LOGGER.info("\n".join(msgs)) + + cache_info = {"information": img_info, "image_hash": img_hash} + # save valid image paths. + with open(valid_img_record, "w") as f: + json.dump(cache_info, f) + + # check and load anns + + img_paths = list(img_info.keys()) + label_paths = img2label_paths(img_paths) + assert label_paths, f"No labels found." + label_hash = self.get_hash(label_paths) + if "label_hash" not in cache_info or cache_info["label_hash"] != label_hash: + self.check_labels = True + + if self.check_labels: + cache_info["label_hash"] = label_hash + nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number corrupt, messages + LOGGER.info( + f"{self.task}: Checking formats of labels with {NUM_THREADS} process(es): " + ) + with Pool(NUM_THREADS) as pool: + pbar = pool.imap( + TrainValDataset.check_label_files, zip(img_paths, label_paths) + ) + pbar = tqdm(pbar, total=len(label_paths)) if self.main_process else pbar + for ( + img_path, + labels_per_file, + nc_per_file, + nm_per_file, + nf_per_file, + ne_per_file, + msg, + ) in pbar: + if nc_per_file == 0: + img_info[img_path]["labels"] = labels_per_file + else: + img_info.pop(img_path) + nc += nc_per_file + nm += nm_per_file + nf += nf_per_file + ne += ne_per_file + if msg: + msgs.append(msg) + if self.main_process: + pbar.desc = f"{nf} label(s) found, {nm} label(s) missing, {ne} label(s) empty, {nc} invalid label files" + if self.main_process: + pbar.close() + with open(valid_img_record, "w") as f: + json.dump(cache_info, f) + if msgs: + LOGGER.info("\n".join(msgs)) + if nf == 0: + LOGGER.warning( + f"WARNING: No labels found in {osp.dirname(img_paths[0])}. " + ) + + if self.task.lower() == "val": + if self.data_dict.get("is_coco", False): # use original json file when evaluating on coco dataset. + assert osp.exists(self.data_dict["anno_path"]), "Eval on coco dataset must provide valid path of the annotation file in config file: data/coco.yaml" + else: + assert ( + self.class_names + ), "Class names is required when converting labels to coco format for evaluating." 
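+                # Editor's note (not part of the original patch): for non-COCO data the
+                # loader writes its own COCO-style file "annotations/instances_<image_dir>.json"
+                # via generate_coco_format_labels below, converting each normalized
+                # (cls, cx, cy, w, h, poly...) row into a pixel [x1, y1, w, h] bbox plus a
+                # flattened pixel polygon under "segmentation", so pycocotools can evaluate it.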
+ save_dir = osp.join(osp.dirname(osp.dirname(img_dirs[0])), "annotations") + if not osp.exists(save_dir): + os.mkdir(save_dir) + save_path = osp.join( + save_dir, "instances_" + osp.basename(img_dirs[0]) + ".json" + ) + TrainValDataset.generate_coco_format_labels( + img_info, self.class_names, save_path + ) + + # img_paths, labels = list( + # zip( + # *[ + # ( + # img_path, + # np.array(info["labels"], dtype=np.float32) + # if info["labels"] + # else np.zeros((0, 5), dtype=np.float32), + # ) + # for img_path, info in img_info.items() + # ] + # ) + # ) + img_paths, labels = list( + zip( + *[ + ( + img_path, + info["labels"] + if info["labels"] + else [], + ) + for img_path, info in img_info.items() + ] + ) + ) + self.img_info = img_info + LOGGER.info( + f"{self.task}: Final numbers of valid images: {len(img_paths)}/ labels: {len(labels)}. " + ) + return img_paths, labels + + def get_mosaic(self, index, shape): + """Gets images and labels after mosaic augments""" + indices = [index] + random.choices( + range(0, len(self.img_paths)), k=3 + ) # 3 additional image indices + random.shuffle(indices) + imgs, hs, ws, labels, segments = [], [], [], [], [] + for index in indices: + img, _, (h, w) = self.load_image(index) + labels_per_img = self.labels[index] + segments_per_img = copy.deepcopy(self.segments[index]) + imgs.append(img) + hs.append(h) + ws.append(w) + labels.append(labels_per_img) + segments.append(segments_per_img) + img, labels, segments = mosaic_augmentation(shape, imgs, hs, ws, labels, segments, self.hyp, self.specific_shape, self.target_height, self.target_width) + img, labels, segments = copy_paste(img, labels, segments, 0) + img, labels, segments = random_affine(img, labels, segments, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + new_shape=shape if not self.specific_shape else (self.target_height, self.target_width)) + return img, labels, segments + + def general_augment(self, img, labels, segments): + """Gets images and labels after general augment + This function applies hsv, random ud-flip and random lr-flips augments. 
+ """ + nl = len(labels) + + # HSV color-space + augment_hsv( + img, + hgain=self.hyp["hsv_h"], + sgain=self.hyp["hsv_s"], + vgain=self.hyp["hsv_v"], + ) + + # Flip up-down + if random.random() < self.hyp["flipud"]: + img = np.flipud(img) + if nl: + segments = np.flipud(segments) + labels[:, 2] = 1 - labels[:, 2] + + # Flip left-right + if random.random() < self.hyp["fliplr"]: + img = np.fliplr(img) + if nl: + segments = np.fliplr(segments) + labels[:, 1] = 1 - labels[:, 1] + + return img, labels, segments.transpose(2, 0, 1) if segments.shape[0]!=0 else segments + + def sort_files_shapes(self): + '''Sort by aspect ratio.''' + batch_num = self.batch_indices[-1] + 1 + s = self.shapes # [height, width] + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_paths = [self.img_paths[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.segments = [self.segments[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * batch_num + for i in range(batch_num): + ari = ar[self.batch_indices == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [1, maxi] + elif mini > 1: + shapes[i] = [1 / mini, 1] + self.batch_shapes = ( + np.ceil(np.array(shapes) * self.img_size / self.stride + self.pad).astype( + np.int_ + ) + * self.stride + ) + + @staticmethod + def check_image(im_file): + '''Verify an image.''' + nc, msg = 0, "" + try: + im = Image.open(im_file) + im.verify() # PIL verify + im = Image.open(im_file) # need to reload the image after using verify() + shape = (im.height, im.width) # (height, width) + try: + im_exif = im._getexif() + if im_exif and ORIENTATION in im_exif: + rotation = im_exif[ORIENTATION] + if rotation in (6, 8): + shape = (shape[1], shape[0]) + except: + im_exif = None + + assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels" + assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}" + if im.format.lower() in ("jpg", "jpeg"): + with open(im_file, "rb") as f: + f.seek(-2, 2) + if f.read() != b"\xff\xd9": # corrupt JPEG + ImageOps.exif_transpose(Image.open(im_file)).save( + im_file, "JPEG", subsampling=0, quality=100 + ) + msg += f"WARNING: {im_file}: corrupt JPEG restored and saved" + return im_file, shape, nc, msg + except Exception as e: + nc = 1 + msg = f"WARNING: {im_file}: ignoring corrupt image: {e}" + return im_file, None, nc, msg + + @staticmethod + def xyn2xy(x, w=640, h=640, padw=0, padh=0): + # Convert normalized segments into pixel segments, shape (n,2) + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[..., 0] = w * x[..., 0] + padw # top left x + y[..., 1] = h * x[..., 1] + padh # top left y + return y + + @staticmethod + def drawit(img, labels, masks, imgname = "", task = ""): + # Convert normalized segments into pixel segments, shape (n,2) + # There are some bugs in Val! 
+ if task == "Val": + return 0 + import copy + + spsp = copy.deepcopy(img) + for label in labels: + xy = label[1:3] * np.asarray(img.shape[:2])[::-1] + wh = label[3:5] * np.asarray(img.shape[:2])[::-1] + pt1 = (xy - wh / 2).astype(np.int_) + pt2 = (xy + wh / 2).astype(np.int_) + cv2.rectangle(spsp, pt1, pt2, (0,255,255), 1) + ssss = random.randint(0,100000000) + for mask in masks: + if mask.shape[:2]!=(img.shape[0], img.shape[1]): + m = cv2.resize(mask,(img.shape[0], img.shape[1])) + else: + m = mask + m = m.reshape(img.shape[0], img.shape[1], 1) + q = np.ones((img.shape[0], img.shape[1], 1), dtype = np.int_) * 255 * m + q = q * m + s = np.zeros((img.shape[0], img.shape[1], 2)) + s = np.concatenate([s, q], axis = 2) + spsp = cv2.addWeighted(spsp, 1, s.astype(np.int_), 0.5, 0, dtype=cv2.CV_8U) + print(img.shape, labels.shape, masks.shape) + try: + print(cv2.imwrite("/home/hadoop-seccv/ssd/wangzhaonian/yolov6_seg/test_img/{}.jpg".format(ssss), spsp)) + print(imgname, ssss, len(labels), len(masks)) + except: + print("?") + + + @staticmethod + def check_label_files(args): + img_path, lb_path = args + nm, nf, ne, nc, msg = 0, 0, 0, 0, "" # number (missing, found, empty, message + try: + if osp.exists(lb_path): + nf = 1 # label found + with open(lb_path, "r") as f: + labels = [ + x.split() for x in f.read().strip().splitlines() if len(x) > 5 # get which has seg + ] + # labels = np.array(labels, dtype=np.float32) + if len(labels): + # assert all( + # len(l) >= 5 for l in labels + # ), f"{lb_path}: wrong label format." + # assert ( + # labels >= 0 + # ).all(), f"{lb_path}: Label values error: all values in label file must > 0" + # assert ( + # labels[:, 1:] <= 1 + # ).all(), f"{lb_path}: Label values error: all coordinates must be normalized" + + # _, indices = np.unique(labels, axis=0, return_index=True) + # if len(indices) < len(labels): # duplicate row check + # labels = labels[indices] # remove duplicates + # msg += f"WARNING: {lb_path}: {len(labels) - len(indices)} duplicate labels removed" + # labels = labels.tolist() + _t = 0 + else: + ne = 1 # label empty + labels = [] + else: + nm = 1 # label missing + labels = [] + return img_path, labels, nc, nm, nf, ne, msg + except Exception as e: + nc = 1 + msg = f"WARNING: {lb_path}: ignoring invalid labels: {e}" + return img_path, None, nc, nm, nf, ne, msg + + @staticmethod + def polygon2mask(img_size, polygons, color=1, downsample_ratio=1): + mask = np.zeros(img_size, dtype=np.uint8) + polygons = np.asarray(polygons) + polygons = polygons.astype(np.int32) + shape = polygons.shape + polygons = polygons.reshape(shape[0], -1, 2) + cv2.fillPoly(mask, polygons, color=color) + nh, nw = (img_size[0] // downsample_ratio, img_size[1] // downsample_ratio) + # NOTE: fillPoly firstly then resize is trying the keep the same way + # of loss calculation when mask-ratio=1. + mask = cv2.resize(mask, (nw, nh)) + return mask + + def polygons2masks(self, img_size, polygons, color, downsample_ratio=1): + """ + Args: + img_size (tuple): The image size. + polygons (list[np.ndarray]): each polygon is [N, M], + N is the number of polygons, + M is the number of points(Be divided by 2). 
+ """ + masks = [] + for si in range(len(polygons)): + mask = self.polygon2mask(img_size, [polygons[si].reshape(-1)], color, downsample_ratio) + masks.append(mask) + return np.array(masks) + + + def polygons2masks_overlap(self, img_size, segments, downsample_ratio=1): + """Return a (640, 640) overlap mask.""" + masks = np.zeros((img_size[0] // downsample_ratio, img_size[1] // downsample_ratio), + dtype=np.int32 if len(segments) > 255 else np.uint8) + areas = [] + ms = [] + for si in range(len(segments)): + mask = self.polygon2mask( + img_size, + [segments[si].reshape(-1)], + downsample_ratio=downsample_ratio, + color=1, + ) + ms.append(mask) + areas.append(mask.sum()) + areas = np.asarray(areas) + index = np.argsort(-areas) + ms = np.array(ms)[index] + for i in range(len(segments)): + mask = ms[i] * (i + 1) + masks = masks + mask + masks = np.clip(masks, a_min=0, a_max=i + 1) + return masks, index + + @staticmethod + def generate_coco_format_labels(img_info, class_names, save_path): + # for evaluation with pycocotools + dataset = {"categories": [], "annotations": [], "images": []} + for i, class_name in enumerate(class_names): + dataset["categories"].append( + {"id": i, "name": class_name, "supercategory": ""} + ) + + ann_id = 0 + LOGGER.info(f"Convert to COCO format") + for i, (img_path, info) in enumerate(tqdm(img_info.items())): + labels = info["labels"] if info["labels"] else [] + img_id = osp.splitext(osp.basename(img_path))[0] + img_h, img_w = info["shape"] + dataset["images"].append( + { + "file_name": os.path.basename(img_path), + "id": img_id, + "width": img_w, + "height": img_h, + } + ) + if labels: + for label in labels: + c, x, y, w, h = label[:5] + c, x, y, w, h = float(c), float(x), float(y), float(w), float(h) + seg = np.asarray(label[5:]).astype(np.float32) + seg = seg.reshape(-1, 2) + #breakpoint() + seg = seg * np.asarray([img_w, img_h]) + seg = seg.reshape(-1) + # convert x,y,w,h to x1,y1,x2,y2 + x1 = (x - w / 2) * img_w + y1 = (y - h / 2) * img_h + x2 = (x + w / 2) * img_w + y2 = (y + h / 2) * img_h + # cls_id starts from 0 + cls_id = int(c) + w = max(0, x2 - x1) + h = max(0, y2 - y1) + dataset["annotations"].append( + { + "area": h * w, + "bbox": [x1, y1, w, h], + "category_id": cls_id, + "id": ann_id, + "image_id": img_id, + "iscrowd": 0, + # mask + "segmentation": list(seg), + } + ) + ann_id += 1 + + with open(save_path, "w") as f: + json.dump(dataset, f) + LOGGER.info( + f"Convert to COCO format finished. Resutls saved in {save_path}" + ) + + @staticmethod + def get_hash(paths): + """Get the hash value of paths""" + assert isinstance(paths, list), "Only support list currently." 
+ h = hashlib.md5("".join(paths).encode()) + return h.hexdigest() + + +class LoadData: + def __init__(self, path, webcam, webcam_addr): + self.webcam = webcam + self.webcam_addr = webcam_addr + if webcam: # if use web camera + imgp = [] + vidp = [int(webcam_addr) if webcam_addr.isdigit() else webcam_addr] + else: + p = str(Path(path).resolve()) # os-agnostic absolute path + if os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '**/*.*'), recursive=True)) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise FileNotFoundError(f'Invalid path {p}') + imgp = [i for i in files if i.split('.')[-1] in IMG_FORMATS] + vidp = [v for v in files if v.split('.')[-1] in VID_FORMATS] + self.files = imgp + vidp + self.nf = len(self.files) + self.type = 'image' + if len(vidp) > 0: + self.add_video(vidp[0]) # new video + else: + self.cap = None + + # @staticmethod + def checkext(self, path): + if self.webcam: + file_type = 'video' + else: + file_type = 'image' if path.split('.')[-1].lower() in IMG_FORMATS else 'video' + return file_type + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + if self.checkext(path) == 'video': + self.type = 'video' + ret_val, img = self.cap.read() + while not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + path = self.files[self.count] + self.add_video(path) + ret_val, img = self.cap.read() + else: + # Read image + self.count += 1 + img = cv2.imread(path) # BGR + return img, path, self.cap + + def add_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files diff --git a/yolov6/models/efficientrep.py b/yolov6/models/efficientrep.py index 5d0de7ce..4ca75083 100644 --- a/yolov6/models/efficientrep.py +++ b/yolov6/models/efficientrep.py @@ -387,20 +387,11 @@ def __init__( block=RepVGGBlock, csp_e=float(1)/2, fuse_P2=False, - cspsppf=False, - stage_block_type="BepC3" + cspsppf=False ): super().__init__() assert channels_list is not None assert num_repeats is not None - - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - self.fuse_P2 = fuse_P2 self.stem = block( @@ -417,7 +408,7 @@ def __init__( kernel_size=3, stride=2 ), - stage_block( + BepC3( in_channels=channels_list[1], out_channels=channels_list[1], n=num_repeats[1], @@ -433,7 +424,7 @@ def __init__( kernel_size=3, stride=2 ), - stage_block( + BepC3( in_channels=channels_list[2], out_channels=channels_list[2], n=num_repeats[2], @@ -449,7 +440,7 @@ def __init__( kernel_size=3, stride=2 ), - stage_block( + BepC3( in_channels=channels_list[3], out_channels=channels_list[3], n=num_repeats[3], @@ -469,7 +460,7 @@ def __init__( kernel_size=3, stride=2, ), - stage_block( + BepC3( in_channels=channels_list[4], out_channels=channels_list[4], n=num_repeats[4], @@ -484,7 +475,7 @@ def __init__( kernel_size=3, stride=2, ), - stage_block( + BepC3( in_channels=channels_list[5], out_channels=channels_list[5], n=num_repeats[5], diff --git a/yolov6/models/effidehead_seg.py b/yolov6/models/effidehead_seg.py new file mode 100644 index 00000000..2bfe9843 --- /dev/null +++ b/yolov6/models/effidehead_seg.py @@ -0,0 +1,452 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from 
yolov6.layers.common import * +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox + + +class Detect(nn.Module): + export = False + '''Efficient Decoupled Head + With hardware-aware degisn, the decoupled head is optimized with + hybridchannels methods. + ''' + def __init__(self, num_classes=80, num_layers=3, inplace=True, head_layers=None, reg_mask=None, use_dfl=True, reg_max=16, nm=32): # detection layer + # nm: number of masks + super().__init__() + assert head_layers is not None + assert reg_mask is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 + nm # number of outputs per anchor + self.nl = num_layers # number of detection layers + self.nm = nm + self.grid = [torch.zeros(1)] * num_layers + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] if num_layers == 3 else [8, 16, 32, 64] # strides computed during build + self.stride = torch.tensor(stride) + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2d(self.reg_max + 1, 1, 1, bias=False) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + + # Init decouple head + self.stems = nn.ModuleList() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*5 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.cls_preds.append(head_layers[idx+3]) + self.reg_preds.append(head_layers[idx+4]) + + def initialize_biases(self): + + for conv in self.cls_preds: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + for conv in self.reg_preds: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) 
+ conv.weight = torch.nn.Parameter(w, requires_grad=True) + + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.proj_conv.weight = nn.Parameter(self.proj.view([1, self.reg_max + 1, 1, 1]).clone().detach(), + requires_grad=False) + + def forward(self, x): + if self.training: + cls_score_list = [] + reg_distri_list = [] + + for i in range(self.nl): + x[i] = self.stems[i](x[i]) + cls_x = x[i] + reg_x = x[i] + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + cls_output = torch.sigmoid(cls_output) + cls_score_list.append(cls_output.flatten(2).permute((0, 2, 1))) + reg_distri_list.append(reg_output.flatten(2).permute((0, 2, 1))) + + cls_score_list = torch.cat(cls_score_list, axis=1) + reg_distri_list = torch.cat(reg_distri_list, axis=1) + + return x, cls_score_list, reg_distri_list + else: + cls_score_list = [] + reg_dist_list = [] + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + x[i] = self.stems[i](x[i]) + cls_x = x[i] + reg_x = x[i] + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + + if self.use_dfl: + reg_output = reg_output.reshape([-1, 4, self.reg_max + 1, l]).permute(0, 2, 1, 3) + reg_output = self.proj_conv(F.softmax(reg_output, dim=1)) + + cls_output = torch.sigmoid(cls_output) + + if self.export: + cls_score_list.append(cls_output) + reg_dist_list.append(reg_output) + else: + cls_score_list.append(cls_output.reshape([b, self.nc, l])) + reg_dist_list.append(reg_output.reshape([b, 4, l])) + + if self.export: + return tuple(torch.cat([cls, reg], 1) for cls, reg in zip(cls_score_list, reg_dist_list)) + + cls_score_list = torch.cat(cls_score_list, axis=-1).permute(0, 2, 1) + reg_dist_list = torch.cat(reg_dist_list, axis=-1).permute(0, 2, 1) + + + anchor_points, stride_tensor = generate_anchors( + x, self.stride, self.grid_cell_size, self.grid_cell_offset, device=x[0].device, is_eval=True, mode='af') + + pred_bboxes = dist2bbox(reg_dist_list, anchor_points, box_format='xywh') + pred_bboxes *= stride_tensor + return torch.cat( + [ + pred_bboxes, + torch.ones((b, pred_bboxes.shape[1], 1), device=pred_bboxes.device, dtype=pred_bboxes.dtype), + cls_score_list + ], + axis=-1) + +def build_seg_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + 
in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # cls_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + if num_layers == 4: + head_layers.add_module('stem3', + # stem3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=1, + stride=1 + ) + ) + head_layers.add_module('cls_conv3', + # cls_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('reg_conv3', + # reg_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('cls_pred3', + # cls_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ) + ) + head_layers.add_module('reg_pred3', + # reg_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + return head_layers + + +###### + + +def build_effidehead_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0 + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1 + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + 
out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # cls_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2 + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + if num_layers == 4: + head_layers.add_module('stem3', + # stem3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=1, + stride=1 + ) + ) + head_layers.add_module('cls_conv3', + # cls_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('reg_conv3', + # reg_conv3 + ConvBNSiLU( + in_channels=channels_list[chx[3]], + out_channels=channels_list[chx[3]], + kernel_size=3, + stride=1 + ) + ) + head_layers.add_module('cls_pred3', + # cls_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ) + ) + head_layers.add_module('reg_pred3', + # reg_pred3 + nn.Conv2d( + in_channels=channels_list[chx[3]], + out_channels=4 * (reg_max + num_anchors), + kernel_size=1 + ) + ) + + return head_layers diff --git a/yolov6/models/heads/effidehead_fuseab_seg.py b/yolov6/models/heads/effidehead_fuseab_seg.py new file mode 100644 index 00000000..80272928 --- /dev/null +++ b/yolov6/models/heads/effidehead_fuseab_seg.py @@ -0,0 +1,551 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from yolov6.layers.common import * +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox + + +class Detect(nn.Module): + export = False + '''Efficient Decoupled Head for fusing anchor-base branches. 
+ ''' + def __init__(self, num_classes=80, anchors=None, num_layers=3, inplace=True, head_layers=None, reg_mask=None, use_dfl=True, reg_max=16, nm=32, fuse_ab=False): # detection layer + super().__init__() + assert head_layers is not None + assert reg_mask is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 + nm # number of outputs per anchor + self.nl = num_layers # number of detection layers + self.nm = nm # number of masks + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.grid = [torch.zeros(1)] * num_layers + self.fuse_ab = fuse_ab + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] if num_layers == 3 else [8, 16, 32, 64] # strides computed during build + self.stride = torch.tensor(stride) + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2d(self.reg_max + 1, 1, 1, bias=False) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + self.anchors_init= ((torch.tensor(anchors) / self.stride[:,None])).reshape(self.nl, self.na, 2) + self.reg_mask = reg_mask + + # Init decouple head + self.stems = nn.ModuleList() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.seg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.seg_preds = nn.ModuleList() + self.cls_preds_ab = nn.ModuleList() + self.reg_preds_ab = nn.ModuleList() + self.seg_preds_ab = nn.ModuleList() + self.seg_proto = nn.ModuleList() + self.seg_proto.append(reg_mask[0]) + + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*10 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.seg_convs.append(head_layers[idx+3]) + self.cls_preds.append(head_layers[idx+4]) + self.reg_preds.append(head_layers[idx+5]) + self.seg_preds.append(head_layers[idx+6]) + if self.fuse_ab: + self.cls_preds_ab.append(head_layers[idx+7]) + self.reg_preds_ab.append(head_layers[idx+8]) + self.seg_preds_ab.append(head_layers[idx+9]) + + + def initialize_biases(self): + + for conv in self.cls_preds: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.cls_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + for conv in self.reg_preds: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.reg_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) 
+ conv.weight = torch.nn.Parameter(w, requires_grad=True) + + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.proj_conv.weight = nn.Parameter(self.proj.view([1, self.reg_max + 1, 1, 1]).clone().detach(), + requires_grad=False) + + def handleseg_af(self, sgot_lst, sg_msk_lst): + ''' + sg_msk_lst --> lst sg_msk: segmask: Shape(bs, 32, w, h) + sgot_lst --> lst sgot: seg_output_conf: Shape(bs, n, 32) + ''' + mask_res = [] + for i in range(len(sgot_lst)): + sgot = sgot_lst[i] + sg_msk = sg_msk_lst[i] + t_mask_res = [] + for j in range(sgot.shape[0]): + sgot_t = sgot[j] # (n, 32) + sg_msk_t = sg_msk[j] # (32, w, h) + m_t = (sgot_t@sg_msk_t.reshape(32, -1)).reshape(-1, *sg_msk_t.shape[1:]) + m_t = m_t.unsqueeze(0) + t_mask_res.append(m_t) + mask_res.append(torch.cat(t_mask_res, 0).flatten(0,1)) + return mask_res + + def handleseg_ab(self, sgot_lst, sg_msk_lst): + ''' + sg_msk_lst --> lst sg_msk: segmask: Shape(bs, 32, w, h) + sgot_lst --> lst sgot: seg_output_conf: Shape(bs, num_of_anchors, h, w, 32) + sgot.flatten(1, 3) -> Shape(bs, n*num_of_anchors, 32) + for j in range(bs) -> ((n*num_of_anchor, 32)@(32, w0, h0) = (n*num_of_anchor, 32)@(32, w0, h0)) + ''' + mask_res = [] + for i in range(len(sgot_lst)): + sgot = sgot_lst[i] + sg_msk = sg_msk_lst[i] + s_shape = sgot.shape[1:4] + sgot = sgot.flatten(1, 3) + t_mask_res = [] + for j in range(sgot.shape[0]): + sgot_t = sgot[j] # (n, 32) + sg_msk_t = sg_msk[j] # (32, w, h) + m_t = (sgot_t@sg_msk_t.reshape(32, -1)).reshape(-1, *sg_msk_t.shape[1:]) + m_t = m_t.unsqueeze(0) + t_mask_res.append(m_t) + mask_res.append(torch.cat(t_mask_res, 0).flatten(0,1)) + return mask_res + + + + + + def forward(self, x): + if self.training: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + cls_score_list_ab = [] + reg_dist_list_ab = [] + seg_conf_list_af = [] + seg_conf_list_ab = [] + seg_list = [] + af_seg_list = [] + ab_seg_list = [] + + seg_mask = self.seg_proto[0](x[0]) + seg_list.append(seg_mask) + + + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_base + if self.fuse_ab: + cls_output_ab = self.cls_preds_ab[i](cls_feat) + reg_output_ab = self.reg_preds_ab[i](reg_feat) + seg_output_ab = self.seg_preds_ab[i](seg_feat) + + cls_output_ab = torch.sigmoid(cls_output_ab) + seg_output_ab = torch.sigmoid(seg_output_ab) + if self.fuse_ab: + seg_conf_list_ab.append(seg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2)) + cls_output_ab = cls_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + cls_score_list_ab.append(cls_output_ab.flatten(1,3)) + + + reg_output_ab = reg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + reg_output_ab[..., 2:4] = ((reg_output_ab[..., 2:4].sigmoid() * 2) ** 2 ) * (self.anchors_init[i].reshape(1, self.na, 1, 1, 2).to(device)) + reg_dist_list_ab.append(reg_output_ab.flatten(1,3)) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + cls_output_af = torch.sigmoid(cls_output_af) + # seg_output_af = torch.sigmoid(seg_output_af) + seg_conf_list_af.append(seg_output_af.flatten(2).permute((0, 2, 1))) + + cls_score_list_af.append(cls_output_af.flatten(2).permute((0, 2, 1))) + 
reg_dist_list_af.append(reg_output_af.flatten(2).permute((0, 2, 1))) + + #Not support fuseab now + if False: + ab_seg_list = self.handleseg_ab(seg_conf_list_ab, seg_list) if self.fuse_ab else [] + cls_score_list_ab = torch.cat(cls_score_list_ab, axis=1) + reg_dist_list_ab = torch.cat(reg_dist_list_ab, axis=1) + cls_score_list_af = torch.cat(cls_score_list_af, axis=1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=1) + + return x, cls_score_list_ab, reg_dist_list_ab, cls_score_list_af, reg_dist_list_af, [seg_conf_list_af, seg_list], ab_seg_list + + else: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + seg_list = [] + seg_conf_list_af = [] + seg_mask = self.seg_proto[0](x[0]) + seg_list.append(seg_mask) + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + if self.use_dfl: + reg_output_af = reg_output_af.reshape([-1, 4, self.reg_max + 1, l]).permute(0, 2, 1, 3) + reg_output_af = self.proj_conv(F.softmax(reg_output_af, dim=1)) + + cls_output_af = torch.sigmoid(cls_output_af) + # seg_output_af = torch.sigmoid(seg_output_af) + proto_no = (torch.ones(b, 1, l) * i).cuda() + + + if self.export: + cls_score_list_af.append(cls_output_af) + reg_dist_list_af.append(reg_output_af) + seg_conf_list_af.append(seg_output_af) + else: + cls_score_list_af.append(cls_output_af.reshape([b, self.nc, l])) + reg_dist_list_af.append(reg_output_af.reshape([b, 4, l])) + seg_conf_list_af.append(torch.cat([proto_no, seg_output_af.reshape([b, 32, l])], axis = 1)) #[which_proto, (32...)] + + if self.export: + return tuple(torch.cat([cls, reg, seg], 1) for cls, reg, seg in zip(cls_score_list_af, reg_dist_list_af, seg_conf_list_af)), seg_list[0] + + cls_score_list_af = torch.cat(cls_score_list_af, axis=-1).permute(0, 2, 1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=-1).permute(0, 2, 1) + seg_conf_list_af = torch.cat(seg_conf_list_af, axis=-1).permute(0, 2, 1) + + + + #anchor_free + anchor_points_af, stride_tensor_af = generate_anchors( + x, self.stride, self.grid_cell_size, self.grid_cell_offset, device=x[0].device, is_eval=True, mode='af') + + pred_bboxes_af = dist2bbox(reg_dist_list_af, anchor_points_af, box_format='xywh') + pred_bboxes_af *= stride_tensor_af + + pred_bboxes = pred_bboxes_af + cls_score_list = cls_score_list_af + + return torch.cat( + [ + pred_bboxes, + torch.ones((b, pred_bboxes.shape[1], 1), device=pred_bboxes.device, dtype=pred_bboxes.dtype), + cls_score_list + ], + axis=-1), seg_list, seg_conf_list_af + +class Proto(nn.Module): + # Borrow from YOLOv5 + def __init__(self, num_layers, channels_list, pos, c_=256, c2=32): # ch_in, number of protos, number of masks + super().__init__() + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + c1 = channels_list[chx[pos]] + self.cv1 = Conv(c1, c_, k=3) + self.upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.cv2 = Conv(c_, c_, k=3) + self.cv3 = Conv(c_, c2) + + def forward(self, x): + return self.cv3(self.cv2(self.upsample(self.cv1(x)))) + +def autopad(k, p=None, d=1): # kernel, padding, dilation + # Pad to 'same' shape outputs + if d > 1: + k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size + 
if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + return p + + +class Conv(nn.Module): + # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation) + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): + super().__init__() + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) + self.bn = nn.BatchNorm2d(c2) + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + + +def build_effidehead_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3, num_masks=32, fuse_ab=False): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # seg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # seg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + 
out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # seg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # cls_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + ) + + return head_layers + + + + + + + diff --git a/yolov6/models/heads/effidehead_fuseab_seg_solo.py b/yolov6/models/heads/effidehead_fuseab_seg_solo.py new file mode 100644 index 00000000..61bd1328 --- /dev/null +++ b/yolov6/models/heads/effidehead_fuseab_seg_solo.py @@ -0,0 +1,540 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +from yolov6.layers.common import * +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox + + +class Detect(nn.Module): + export = False + '''Efficient Decoupled Head for fusing anchor-base branches. 
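+    The anchor-free branch predicts class scores, DFL box distributions and per-location mask
+    parameters; an auxiliary anchor-based branch can be enabled via fuse_ab. Mask prototypes are
+    built by summing the Proto outputs of the three feature levels.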
+ ''' + def __init__(self, num_classes=80, anchors=None, num_layers=3, inplace=True, head_layers=None, reg_mask=None, use_dfl=True, reg_max=16, nm=32, fuse_ab=False): # detection layer + super().__init__() + assert head_layers is not None + assert reg_mask is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 + nm # number of outputs per anchor + self.nl = num_layers # number of detection layers + self.nm = nm # number of masks + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.grid = [torch.zeros(1)] * num_layers + self.fuse_ab = fuse_ab + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] if num_layers == 3 else [8, 16, 32, 64] # strides computed during build + self.stride = torch.tensor(stride) + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj_conv = nn.Conv2d(self.reg_max + 1, 1, 1, bias=False) + self.grid_cell_offset = 0.5 + self.grid_cell_size = 5.0 + self.anchors_init= ((torch.tensor(anchors) / self.stride[:,None])).reshape(self.nl, self.na, 2) + self.reg_mask = reg_mask + + # Init decouple head + self.stems = nn.ModuleList() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.seg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.seg_preds = nn.ModuleList() + self.cls_preds_ab = nn.ModuleList() + self.reg_preds_ab = nn.ModuleList() + self.seg_preds_ab = nn.ModuleList() + self.seg_proto = nn.ModuleList() + self.seg_proto.append(reg_mask[0]) + self.seg_proto.append(reg_mask[1]) + self.seg_proto.append(reg_mask[2]) + + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*10 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.seg_convs.append(head_layers[idx+3]) + self.cls_preds.append(head_layers[idx+4]) + self.reg_preds.append(head_layers[idx+5]) + self.seg_preds.append(head_layers[idx+6]) + if self.fuse_ab: + self.cls_preds_ab.append(head_layers[idx+7]) + self.reg_preds_ab.append(head_layers[idx+8]) + self.seg_preds_ab.append(head_layers[idx+9]) + + + def initialize_biases(self): + + for conv in self.cls_preds: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.cls_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + for conv in self.reg_preds: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) + conv.weight = torch.nn.Parameter(w, requires_grad=True) + + if self.fuse_ab: + for conv in self.reg_preds_ab: + b = conv.bias.view(-1, ) + b.data.fill_(1.0) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + w = conv.weight + w.data.fill_(0.) 
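+                # re-register the zero-filled weights as a Parameter so they stay trainable after the in-place init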
+ conv.weight = torch.nn.Parameter(w, requires_grad=True) + + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.proj_conv.weight = nn.Parameter(self.proj.view([1, self.reg_max + 1, 1, 1]).clone().detach(), + requires_grad=False) + + + def handleseg_ab(self, sgot_lst, sg_msk_lst): + ''' + sg_msk_lst --> lst sg_msk: segmask: Shape(bs, 32, w, h) + sgot_lst --> lst sgot: seg_output_conf: Shape(bs, num_of_anchors, h, w, 32) + sgot.flatten(1, 3) -> Shape(bs, n*num_of_anchors, 32) + for j in range(bs) -> ((n*num_of_anchor, 32)@(32, w0, h0) = (n*num_of_anchor, 32)@(32, w0, h0)) + ''' + mask_res = [] + for i in range(len(sgot_lst)): + sgot = sgot_lst[i] + sg_msk = sg_msk_lst[i] + s_shape = sgot.shape[1:4] + sgot = sgot.flatten(1, 3) + t_mask_res = [] + for j in range(sgot.shape[0]): + sgot_t = sgot[j] # (n, 32) + sg_msk_t = sg_msk[j] # (32, w, h) + m_t = (sgot_t@sg_msk_t.reshape(self.nm, -1)).reshape(-1, *sg_msk_t.shape[1:]) + m_t = m_t.unsqueeze(0) + t_mask_res.append(m_t) + mask_res.append(torch.cat(t_mask_res, 0).flatten(0,1)) + return mask_res + + + + + + def forward(self, x): + if self.training: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + cls_score_list_ab = [] + reg_dist_list_ab = [] + seg_conf_list_af = [] + seg_conf_list_ab = [] + seg_list = [] + af_seg_list = [] + ab_seg_list = [] + + s1 = self.seg_proto[0](x[0]) + s2 = self.seg_proto[1](x[1]) + s3 = self.seg_proto[2](x[2]) + seg_mask = s1 + s2 + s3 + seg_list.append(seg_mask) + + + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_base + if self.fuse_ab: + cls_output_ab = self.cls_preds_ab[i](cls_feat) + reg_output_ab = self.reg_preds_ab[i](reg_feat) + seg_output_ab = self.seg_preds_ab[i](seg_feat) + + cls_output_ab = torch.sigmoid(cls_output_ab) + seg_output_ab = torch.sigmoid(seg_output_ab) + if self.fuse_ab: + seg_conf_list_ab.append(seg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2)) + cls_output_ab = cls_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + cls_score_list_ab.append(cls_output_ab.flatten(1,3)) + + + reg_output_ab = reg_output_ab.reshape(b, self.na, -1, h, w).permute(0,1,3,4,2) + reg_output_ab[..., 2:4] = ((reg_output_ab[..., 2:4].sigmoid() * 2) ** 2 ) * (self.anchors_init[i].reshape(1, self.na, 1, 1, 2).to(device)) + reg_dist_list_ab.append(reg_output_ab.flatten(1,3)) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + cls_output_af = torch.sigmoid(cls_output_af) + # seg_output_af = torch.sigmoid(seg_output_af) + seg_conf_list_af.append(seg_output_af.flatten(2).permute((0, 2, 1))) + + cls_score_list_af.append(cls_output_af.flatten(2).permute((0, 2, 1))) + reg_dist_list_af.append(reg_output_af.flatten(2).permute((0, 2, 1))) + + #Not support fuseab now + if False: + ab_seg_list = self.handleseg_ab(seg_conf_list_ab, seg_list) if self.fuse_ab else [] + cls_score_list_ab = torch.cat(cls_score_list_ab, axis=1) + reg_dist_list_ab = torch.cat(reg_dist_list_ab, axis=1) + cls_score_list_af = torch.cat(cls_score_list_af, axis=1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=1) + + return x, cls_score_list_ab, reg_dist_list_ab, cls_score_list_af, reg_dist_list_af, [seg_conf_list_af, seg_list], 
ab_seg_list + + else: + device = x[0].device + cls_score_list_af = [] + reg_dist_list_af = [] + seg_list = [] + seg_conf_list_af = [] + s1 = self.seg_proto[0](x[0]) + s2 = self.seg_proto[1](x[1]) + s3 = self.seg_proto[2](x[2]) + seg_mask = s1 + s2 + s3 + seg_list.append(seg_mask) + + for i in range(self.nl): + b, _, h, w = x[i].shape + l = h * w + + + x[i] = self.stems[i](x[i]) + + cls_x = x[i] + reg_x = x[i] + seg_x = x[i] + + cls_feat = self.cls_convs[i](cls_x) + reg_feat = self.reg_convs[i](reg_x) + seg_feat = self.seg_convs[i](seg_x) + + #anchor_free + cls_output_af = self.cls_preds[i](cls_feat) + reg_output_af = self.reg_preds[i](reg_feat) + seg_output_af = self.seg_preds[i](seg_feat) + + if self.use_dfl: + reg_output_af = reg_output_af.reshape([-1, 4, self.reg_max + 1, l]).permute(0, 2, 1, 3) + reg_output_af = self.proj_conv(F.softmax(reg_output_af, dim=1)) + + cls_output_af = torch.sigmoid(cls_output_af) + proto_no = (torch.ones(b, 1, l) * i).cuda() + + + if self.export: + cls_score_list_af.append(cls_output_af) + reg_dist_list_af.append(reg_output_af) + seg_conf_list_af.append(seg_output_af) + else: + cls_score_list_af.append(cls_output_af.reshape([b, self.nc, l])) + reg_dist_list_af.append(reg_output_af.reshape([b, 4, l])) + seg_conf_list_af.append(torch.cat([proto_no, seg_output_af.reshape([b, 67, l])], axis = 1)) #[which_proto, (32...)] + + if self.export: + return tuple(torch.cat([cls, reg, seg], 1) for cls, reg, seg in zip(cls_score_list_af, reg_dist_list_af, seg_conf_list_af)), seg_list[0] + + cls_score_list_af = torch.cat(cls_score_list_af, axis=-1).permute(0, 2, 1) + reg_dist_list_af = torch.cat(reg_dist_list_af, axis=-1).permute(0, 2, 1) + seg_conf_list_af = torch.cat(seg_conf_list_af, axis=-1).permute(0, 2, 1) + + + + #anchor_free + anchor_points_af, stride_tensor_af = generate_anchors( + x, self.stride, self.grid_cell_size, self.grid_cell_offset, device=x[0].device, is_eval=True, mode='af') + + pred_bboxes_af = dist2bbox(reg_dist_list_af, anchor_points_af, box_format='xywh') + pred_bboxes_af *= stride_tensor_af + + pred_bboxes = pred_bboxes_af + cls_score_list = cls_score_list_af + + return torch.cat( + [ + pred_bboxes, + torch.ones((b, pred_bboxes.shape[1], 1), device=pred_bboxes.device, dtype=pred_bboxes.dtype), + cls_score_list + ], + axis=-1), seg_list, seg_conf_list_af + +class Proto(nn.Module): + # Borrowed from YOLOv5 + def __init__(self, num_layers, channels_list, pos, c_=256, c2=64, scale_factor=2): # ch_in, number of protos, number of masks + super().__init__() + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + c1 = channels_list[chx[pos]] + self.cv1 = Conv(c1, c_, k=3) + self.upsample = nn.Upsample(scale_factor=scale_factor, mode='nearest') + self.cv2 = Conv(c_, c_, k=3) + self.cv3 = Conv(c_, c2) + + def forward(self, x): + return self.cv3(self.cv2(self.upsample(self.cv1(x)))) + +def autopad(k, p=None, d=1): # kernel, padding, dilation + # Pad to 'same' shape outputs + if d > 1: + k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size + if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + return p + + +class Conv(nn.Module): + # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation) + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): + super().__init__() + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) + self.bn = 
nn.BatchNorm2d(c2) + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + + +def build_effidehead_layer(channels_list, num_anchors, num_classes, reg_max=16, num_layers=3, num_masks=67, fuse_ab=False): + + chx = [6, 8, 10] if num_layers == 3 else [8, 9, 10, 11] + + head_layers = nn.Sequential( + # stem0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=1, + stride=1 + ), + # cls_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # reg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # seg_conv0 + ConvBNSiLU( + in_channels=channels_list[chx[0]], + out_channels=channels_list[chx[0]], + kernel_size=3, + stride=1 + ), + # cls_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred0_af + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred0_3ab + nn.Conv2d( + in_channels=channels_list[chx[0]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=1, + stride=1 + ), + # cls_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # reg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # seg_conv1 + ConvBNSiLU( + in_channels=channels_list[chx[1]], + out_channels=channels_list[chx[1]], + kernel_size=3, + stride=1 + ), + # cls_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred1_af + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred1_3ab + nn.Conv2d( + in_channels=channels_list[chx[1]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + # stem2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=1, + stride=1 + ), + # cls_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # reg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 + ), + # seg_conv2 + ConvBNSiLU( + in_channels=channels_list[chx[2]], + out_channels=channels_list[chx[2]], + kernel_size=3, + stride=1 
+ ), + # cls_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes, + kernel_size=1 + ), + # reg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * (reg_max + 1), + kernel_size=1 + ), + # seg_pred2_af + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks, + kernel_size=1 + ), + # cls_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # seg_pred2_3ab + nn.Conv2d( + in_channels=channels_list[chx[2]], + out_channels=num_masks * num_anchors, + kernel_size=1 + ), + ) + + return head_layers + + + + + + + diff --git a/yolov6/models/losses/loss.py b/yolov6/models/losses/loss.py index ec534923..c4fe8d87 100644 --- a/yolov6/models/losses/loss.py +++ b/yolov6/models/losses/loss.py @@ -30,8 +30,6 @@ def __init__(self, ): self.fpn_strides = fpn_strides - self.cached_feat_sizes = [torch.Size([0, 0]) for _ in fpn_strides] - self.cached_anchors = None self.grid_cell_size = grid_cell_size self.grid_cell_offset = grid_cell_offset self.num_classes = num_classes @@ -60,13 +58,8 @@ def __call__( ): feats, pred_scores, pred_distri = outputs - if all(feat.shape[2:] == cfsize for feat, cfsize in zip(feats, self.cached_feat_sizes)): - anchors, anchor_points, n_anchors_list, stride_tensor = self.cached_anchors - else: - self.cached_feat_sizes = [feat.shape[2:] for feat in feats] - anchors, anchor_points, n_anchors_list, stride_tensor = \ - generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) - self.cached_anchors = anchors, anchor_points, n_anchors_list, stride_tensor + anchors, anchor_points, n_anchors_list, stride_tensor = \ + generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) assert pred_scores.type() == pred_distri.type() gt_bboxes_scale = torch.tensor([batch_width, batch_height, batch_width, batch_height]).type_as(pred_scores) diff --git a/yolov6/models/losses/seg_loss.py b/yolov6/models/losses/seg_loss.py new file mode 100644 index 00000000..04a25ecd --- /dev/null +++ b/yolov6/models/losses/seg_loss.py @@ -0,0 +1,532 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox, bbox2dist, xywh2xyxy, box_iou +from yolov6.utils.figure_iou import IOUloss +from yolov6.assigners.atss_assigner_seg import ATSSAssigner +from yolov6.assigners.tal_assigner_seg import TaskAlignedAssigner +import time +import pickle + +class ComputeLoss: + '''Loss computation func.''' + def __init__(self, + fpn_strides=[8, 16, 32], + grid_cell_size=5.0, + grid_cell_offset=0.5, + num_classes=80, + ori_img_size=640, + warmup_epoch=4, + use_dfl=True, + reg_max=16, + nm=32, + iou_type='giou', + loss_weight={ + 'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5, + 'seg': 2.5}, + ): + + self.fpn_strides = fpn_strides + self.grid_cell_size = grid_cell_size + self.grid_cell_offset = grid_cell_offset + self.num_classes = num_classes + self.ori_img_size = ori_img_size + self.nm = nm + self.tt = nm + self.warmup_epoch = warmup_epoch + self.warmup_assigner = ATSSAssigner(9, num_classes=self.num_classes) + self.formal_assigner = TaskAlignedAssigner(topk=13, num_classes=self.num_classes, 
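+                                              # task-aligned assignment: candidates ranked by cls_score**alpha * IoU**beta, top 13 kept per ground truth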
alpha=1.0, beta=6.0) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.iou_type = iou_type + self.varifocal_loss = VarifocalLoss().cuda() + self.bbox_loss = BboxLoss(self.num_classes, self.reg_max, self.use_dfl, self.iou_type).cuda() + self.loss_weight = loss_weight + + def __call__( + self, + outputs, + targets, + epoch_num, + step_num, + batch_height, + batch_width, + segmasks, + img=None, + ): + + feats, pred_scores, pred_distri, pred_seg = outputs # seg_list:shape(3)(b, nm, mw, mh) seg_conf_list:shape(3):(b, l ,nm) + seg_cf, seg_proto = pred_seg + anchors, anchor_points, n_anchors_list, stride_tensor = \ + generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) + + assert pred_scores.type() == pred_distri.type() + gt_bboxes_scale = torch.tensor([batch_width, batch_height, batch_width, batch_height]).type_as(pred_scores) + batch_size = pred_scores.shape[0] + + targets, gt_segmasks =self.preprocess(targets, batch_size, gt_bboxes_scale, segmasks) + gt_labels = targets[:, :, :1] + gt_bboxes = targets[:, :, 1:] #xyxy + mask_gt = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self.bbox_decode(anchor_points_s, pred_distri) #xyxy + try: + if epoch_num < self.warmup_epoch: + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + anchors, + n_anchors_list, + gt_labels, + gt_bboxes, + mask_gt, + pred_bboxes.detach() * stride_tensor, + gt_segmasks) + else: + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks) + + except RuntimeError: + print( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." 
+ ) + torch.cuda.empty_cache() + print("------------CPU Mode for This Batch-------------") + if epoch_num < self.warmup_epoch: + _anchors = anchors.cpu().float() + _n_anchors_list = n_anchors_list + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + _anchors, + _n_anchors_list, + _gt_labels, + _gt_bboxes, + _mask_gt, + _pred_bboxes * _stride_tensor, + _segmasks) + + else: + _pred_scores = pred_scores.detach().cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _anchor_points = anchor_points.cpu().float() + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + _pred_scores, + _pred_bboxes * _stride_tensor, + _anchor_points, + _gt_labels, + _gt_bboxes, + _mask_gt, + _segmasks) + + target_labels = target_labels.cuda() + target_bboxes = target_bboxes.cuda() + target_scores = target_scores.cuda() + fg_mask = fg_mask.cuda() + for _ in idx_lst: + _ = _.cuda() + + + if step_num % 10 == 0: + torch.cuda.empty_cache() + + # rescale bbox + target_bboxes /= stride_tensor + + # cls loss + target_labels = torch.where(fg_mask > 0, target_labels, torch.full_like(target_labels, self.num_classes)) + one_hot_label = F.one_hot(target_labels.long(), self.num_classes + 1)[..., :-1] + loss_cls = self.varifocal_loss(pred_scores, target_scores, one_hot_label) + + + target_scores_sum = target_scores.sum() + + # avoid devide zero error, devide by zero will cause loss to be inf or nan. 
+ # if target_scores_sum is 0, loss_cls equals to 0 alson + if target_scores_sum > 1: + loss_cls /= target_scores_sum + + # bbox loss + loss_iou, loss_dfl = self.bbox_loss(pred_distri, pred_bboxes, anchor_points_s, target_bboxes, + target_scores, target_scores_sum, fg_mask) + + loss_seg = self.mask_loss(gt_segmasks, seg_cf, seg_proto, target_bboxes, fg_mask, idx_lst, target_scores, target_scores_sum) + + loss = self.loss_weight['class'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + \ + self.loss_weight['seg'] * loss_seg + + + return loss, \ + torch.cat(((self.loss_weight['iou'] * loss_iou).unsqueeze(0), + (self.loss_weight['dfl'] * loss_dfl).unsqueeze(0), + (self.loss_weight['class'] * loss_cls).unsqueeze(0), + (self.loss_weight['seg'] * loss_seg).unsqueeze(0))).detach() + + def preprocess(self, targets, batch_size, scale_tensor, segmask): + targets_list = np.zeros((batch_size, 1, 5)).tolist() + cu = [] + already = [] + # seg_list = np.zeros((batch_size, 1, *segmask.shape[1:])).tolist() + for i, item in enumerate(targets.cpu().numpy().tolist()): + index = int(item[0]) + targets_list[index].append(item[1:]) + if index not in already: + already.append(index) + cu.append(i) + cu.append(segmask.shape[0]) + max_len = max((len(l) for l in targets_list)) + segmasks = torch.zeros(batch_size, max_len - 1, segmask.shape[-2], segmask.shape[-1]).cuda() + if len(already) != 0: + for i in range(len(already)): + j = already[i] + start = cu[i] + end = cu[i+1] + segmasks[j, : end - start] = segmask[start: end].clone() + targets = torch.from_numpy(np.array(list(map(lambda l:l + [[-1,0,0,0,0]]*(max_len - len(l)), targets_list)))[:,1:,:]).to(targets.device) + + batch_target = targets[:, :, 1:5].mul_(scale_tensor) + targets[..., 1:] = xywh2xyxy(batch_target) + return targets, segmasks + + def bbox_decode(self, anchor_points, pred_dist): + if self.use_dfl: + batch_size, n_anchors, _ = pred_dist.shape + pred_dist = F.softmax(pred_dist.view(batch_size, n_anchors, 4, self.reg_max + 1), dim=-1).matmul(self.proj.to(pred_dist.device)) + return dist2bbox(pred_dist, anchor_points) + + def mask_loss(self, gt_segmasks, seg_cf, seg_proto, txyxy_ori, fg_mask, idx_lst, target_scores=None, target_scores_sum=None): + # pred_mask_lst -> list + ''' + pred_mask -> Shape(n1, w, h) + gt_mask -> Shape(n, img_w, img_h) + xyxy -> Shape(n, 4) + sum(n1, n2, n3, ...) = n + torch.abs((xyxy[..., 3] - xyxy[..., 1]) * (xyxy[..., 4] - xyxy[..., 2])) -> area + fg_mask --> (bs, tsize) + idx -> (bs, tsize) + gt_segmasks -> (bs, labelsize, w, h) + ''' + sl = 0 + sl2 = 0 + bl = [2, 4, 8] + num_pos = fg_mask.sum() + tloss = torch.zeros(1).float().cuda() + if num_pos<=0: + for ipred in seg_proto: + tloss += (ipred.sum() * 0.) + for ipred in seg_cf: + tloss += (ipred.sum() * 0.) + return tloss[0] + + + xyxy_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + mtarget_scores = target_scores.sum(-1) # (bs, nl, 1) + + sl = 0 + qf = len(idx_lst) == 1 and len(idx_lst[0].shape) == 2 + if qf: + idx_lst = idx_lst[0] + for j in range(len(seg_cf)): + ishape = 0 + pshape = 0 + + iseg_proto = seg_proto[0] # (bs, 32, h, w) + bs = iseg_proto.shape[0] + iseg_cf = seg_cf[j] # (bs, part_n, 32) + + pshape = iseg_proto.shape[-1] + ishape = iseg_cf.shape[1] # (1) = part_n + idx = idx_lst[:, sl: sl + ishape] # (bs, part_n) + + ifg_mask = fg_mask[:, sl: sl + ishape] # (n) --> (bs, part_n) + itarget_scores = mtarget_scores[:, sl: sl + ishape] + if ifg_mask.sum() <= 0: + tloss += (iseg_proto.sum() * 0.) 
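+                # the 0.-scaled sums keep the unused seg outputs in the autograd graph without contributing to the loss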
+ tloss += (iseg_cf.sum() * 0.) + continue + target_sg = [] + pred_sg = [] + ixyxy_lst = [] + mask_weight = [] + for i in range(bs): + idx_thisbatch = torch.masked_select(idx[i], ifg_mask[i]) #(casize) + igt_segmasks = gt_segmasks.reshape(-1, *gt_segmasks.shape[-2:])[idx_thisbatch] # (?1, h?, w?) --> (?2, h?, w?) + imask_weight = torch.masked_select(itarget_scores[i], ifg_mask[i]).unsqueeze(-1) + mask_weight.append(imask_weight) + target_sg.append(igt_segmasks) + tiseg_cf = torch.masked_select(iseg_cf[i], ifg_mask[i].unsqueeze(-1).repeat(1, self.tt)) # (?2, 32) + tiseg_cf = tiseg_cf.reshape(-1, self.tt) + ipred_seg = (tiseg_cf@iseg_proto[i].reshape(self.tt, -1)).reshape(-1, pshape, pshape) # (?2, h, w) + ixyxy = torch.masked_select(txyxy_ori[i, sl: sl + ishape], xyxy_mask[i, sl: sl + ishape, :]).reshape(-1, 4) # (n, 4) --> (part_n, 4) --> (?2, 4) + ixyxy_lst.append(ixyxy) + pred_sg.append(ipred_seg) + + + + + bxyxy = torch.cat(ixyxy_lst, dim = 0) * bl[j] + bpred_seg = torch.cat(pred_sg, dim = 0) + bgt_seg = torch.cat(target_sg, dim = 0) + masks_weight = torch.cat(mask_weight, dim = 0).reshape(-1) + if tuple(bgt_seg.shape[-2:]) != (pshape, pshape): # downsample + bgt_seg = F.interpolate(bgt_seg[None], (pshape, pshape), mode='nearest')[0] + area = torch.abs((bxyxy[..., 2] - bxyxy[..., 0]) * (bxyxy[..., 3] - bxyxy[..., 1])) + area = area / (pshape) + area = area / (pshape) + + + + + + sl += ishape + loss = F.binary_cross_entropy_with_logits(bpred_seg, bgt_seg, reduction='none') + + loss = (self.crop_mask(loss, bxyxy).mean(dim=(1, 2)) / area) * masks_weight + loss = loss.sum() + tloss += loss + if target_scores_sum > 1: + tloss[0] = tloss[0] / target_scores_sum + return tloss[0] / len(seg_cf) + + + @staticmethod + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
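+        Boxes are compared against pixel indices here, so they must be given at the mask's own resolution.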
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + +class VarifocalLoss(nn.Module): + def __init__(self): + super(VarifocalLoss, self).__init__() + + def forward(self, pred_score,gt_score, label, alpha=0.75, gamma=2.0): + + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + with torch.cuda.amp.autocast(enabled=False): + loss = (F.binary_cross_entropy(pred_score.float(), gt_score.float(), reduction='none') * weight).sum() + + return loss + + +class BboxLoss(nn.Module): + def __init__(self, num_classes, reg_max, use_dfl=False, iou_type='giou'): + super(BboxLoss, self).__init__() + self.num_classes = num_classes + self.iou_loss = IOUloss(box_format='xyxy', iou_type=iou_type, eps=1e-10) + self.reg_max = reg_max + self.use_dfl = use_dfl + + def forward(self, pred_dist, pred_bboxes, anchor_points, + target_bboxes, target_scores, target_scores_sum, fg_mask): + + # select positive samples mask + num_pos = fg_mask.sum() + if num_pos > 0: + # iou loss + bbox_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + target_bboxes_pos = torch.masked_select( + target_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + target_scores.sum(-1), fg_mask).unsqueeze(-1) + loss_iou = self.iou_loss(pred_bboxes_pos, + target_bboxes_pos) * bbox_weight + if target_scores_sum > 1: + loss_iou = loss_iou.sum() / target_scores_sum + else: + loss_iou = loss_iou.sum() + + # dfl loss + if self.use_dfl: + dist_mask = fg_mask.unsqueeze(-1).repeat( + [1, 1, (self.reg_max + 1) * 4]) + pred_dist_pos = torch.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1]) + target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max) + target_ltrb_pos = torch.masked_select( + target_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, + target_ltrb_pos) * bbox_weight + if target_scores_sum > 1: + loss_dfl = loss_dfl.sum() / target_scores_sum + else: + loss_dfl = loss_dfl.sum() + else: + loss_dfl = pred_dist.sum() * 0. + + else: + loss_iou = pred_dist.sum() * 0. + loss_dfl = pred_dist.sum() * 0. + + return loss_iou, loss_dfl + + def _df_loss(self, pred_dist, target): + target_left = target.to(torch.long) + target_right = target_left + 1 + weight_left = target_right.to(torch.float) - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_left.view(-1), reduction='none').view( + target_left.shape) * weight_left + loss_right = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_right.view(-1), reduction='none').view( + target_left.shape) * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + +def dice_loss(pred, + target, + weight=None, + eps=1e-3, + reduction='mean', + naive_dice=False, + avg_factor=None): + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. 
+ - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + +def weight_reduce_loss(loss, + weight=None, + reduction='mean', + avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Optional[Tensor], optional): Element-wise weights. + Defaults to None. + reduction (str, optional): Same as built-in losses of PyTorch. + Defaults to 'mean'. + avg_factor (Optional[float], optional): Average factor when + computing the mean of losses. Defaults to None. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. 
+ """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() \ No newline at end of file diff --git a/yolov6/models/losses/seg_loss_solo_main.py b/yolov6/models/losses/seg_loss_solo_main.py new file mode 100644 index 00000000..3a329beb --- /dev/null +++ b/yolov6/models/losses/seg_loss_solo_main.py @@ -0,0 +1,583 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +from yolov6.assigners.anchor_generator import generate_anchors +from yolov6.utils.general import dist2bbox, bbox2dist, xywh2xyxy, box_iou +from yolov6.utils.figure_iou import IOUloss +from yolov6.assigners.atss_assigner_seg import ATSSAssigner +from yolov6.assigners.tal_assigner_seg import TaskAlignedAssigner +import time +import pickle + +class ComputeLoss: + '''Loss computation func.''' + def __init__(self, + fpn_strides=[8, 16, 32], + grid_cell_size=5.0, + grid_cell_offset=0.5, + num_classes=80, + ori_img_size=640, + warmup_epoch=4, + use_dfl=True, + reg_max=16, + weight_nums = 66, + bias_nums = 1, + nm = 64, + dyconv_channels = 66, + iou_type='giou', + loss_weight={ + 'class': 1.0, + 'iou': 2.5, + 'dfl': 0.5, + 'seg': 2.5}, + ): + + self.fpn_strides = fpn_strides + self.grid_cell_size = grid_cell_size + self.grid_cell_offset = grid_cell_offset + self.num_classes = num_classes + self.ori_img_size = ori_img_size + self.nm = nm + self.tt = nm + bias_nums + 2 + self.weight_nums = [nm + 2] + self.bias_nums = [bias_nums] + self.dyconv_channels = dyconv_channels + + self.warmup_epoch = warmup_epoch + self.warmup_assigner = ATSSAssigner(9, num_classes=self.num_classes) + self.formal_assigner = TaskAlignedAssigner(topk=13, num_classes=self.num_classes, alpha=1.0, beta=6.0) + + self.use_dfl = use_dfl + self.reg_max = reg_max + self.proj = nn.Parameter(torch.linspace(0, self.reg_max, self.reg_max + 1), requires_grad=False) + self.iou_type = iou_type + self.varifocal_loss = VarifocalLoss().cuda() + self.bbox_loss = BboxLoss(self.num_classes, self.reg_max, self.use_dfl, self.iou_type).cuda() + self.loss_weight = loss_weight + self.dice = True + + def parse_dynamic_params(self, flatten_kernels): + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, self.weight_nums + self.bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst * + self.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def handle_proto_coord(self, proto): + _ = proto.shape[-1] + x = torch.arange(0, 1, step = 1 / _).unsqueeze(0).unsqueeze(0).repeat(1, _, 1).to(proto.dtype).to(proto.device) + y = torch.arange(0, 1, step = 1 / _).unsqueeze(0).T.unsqueeze(0).repeat(1, 1, _).to(proto.dtype).to(proto.device) + return torch.cat([proto, x, y]).reshape(1, -1, _, _) + + def __call__( + self, + outputs, + targets, + epoch_num, + step_num, + batch_height, + batch_width, + segmasks, + img=None, + ): + + + feats, 
pred_scores, pred_distri, pred_seg = outputs # seg_list:shape(3)(b, nm, mw, mh) seg_conf_list:shape(3):(b, l ,nm) + seg_cf, seg_proto = pred_seg + anchors, anchor_points, n_anchors_list, stride_tensor = \ + generate_anchors(feats, self.fpn_strides, self.grid_cell_size, self.grid_cell_offset, device=feats[0].device) + + assert pred_scores.type() == pred_distri.type() + gt_bboxes_scale = torch.tensor([batch_width, batch_height, batch_width, batch_height]).type_as(pred_scores) + batch_size = pred_scores.shape[0] + + targets, gt_segmasks =self.preprocess(targets, batch_size, gt_bboxes_scale, segmasks) + gt_labels = targets[:, :, :1] + gt_bboxes = targets[:, :, 1:] #xyxy + mask_gt = (gt_bboxes.sum(-1, keepdim=True) > 0).float() + + + # pboxes + anchor_points_s = anchor_points / stride_tensor + pred_bboxes = self.bbox_decode(anchor_points_s, pred_distri) #xyxy + + + try: + if epoch_num < self.warmup_epoch: + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + anchors, + n_anchors_list, + gt_labels, + gt_bboxes, + mask_gt, + pred_bboxes.detach() * stride_tensor, + gt_segmasks) + else: + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + pred_scores.detach(), + pred_bboxes.detach() * stride_tensor, + anchor_points, + gt_labels, + gt_bboxes, + mask_gt, + gt_segmasks) + + except RuntimeError: + print( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." + ) + torch.cuda.empty_cache() + print("------------CPU Mode for This Batch-------------") + if epoch_num < self.warmup_epoch: + _anchors = anchors.cpu().float() + _n_anchors_list = n_anchors_list + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, target_segmasks = \ + self.warmup_assigner( + _anchors, + _n_anchors_list, + _gt_labels, + _gt_bboxes, + _mask_gt, + _pred_bboxes * _stride_tensor, + _segmasks) + + else: + _pred_scores = pred_scores.detach().cpu().float() + _pred_bboxes = pred_bboxes.detach().cpu().float() + _anchor_points = anchor_points.cpu().float() + _gt_labels = gt_labels.cpu().float() + _gt_bboxes = gt_bboxes.cpu().float() + _mask_gt = mask_gt.cpu().float() + _stride_tensor = stride_tensor.cpu().float() + _segmasks = gt_segmasks.cpu().float() + + target_labels, target_bboxes, target_scores, fg_mask, idx_lst = \ + self.formal_assigner( + _pred_scores, + _pred_bboxes * _stride_tensor, + _anchor_points, + _gt_labels, + _gt_bboxes, + _mask_gt, + _segmasks) + + target_labels = target_labels.cuda() + target_bboxes = target_bboxes.cuda() + target_scores = target_scores.cuda() + fg_mask = fg_mask.cuda() + for _ in idx_lst: + _ = _.cuda() + + if step_num % 10 == 0: + torch.cuda.empty_cache() + + # rescale bbox + target_bboxes /= stride_tensor + + # cls loss + target_labels = torch.where(fg_mask > 0, target_labels, torch.full_like(target_labels, self.num_classes)) + one_hot_label = F.one_hot(target_labels.long(), self.num_classes + 1)[..., :-1] + loss_cls = self.varifocal_loss(pred_scores, target_scores, one_hot_label) + + + target_scores_sum = target_scores.sum() + + + if target_scores_sum > 1: + loss_cls /= target_scores_sum + + # bbox loss + loss_iou, 
loss_dfl = self.bbox_loss(pred_distri, pred_bboxes, anchor_points_s, target_bboxes, + target_scores, target_scores_sum, fg_mask) + + loss_seg = self.mask_loss(gt_segmasks, seg_cf, seg_proto, target_bboxes, fg_mask, idx_lst, target_scores, target_scores_sum, epoch=0) + + loss = self.loss_weight['class'] * loss_cls + \ + self.loss_weight['iou'] * loss_iou + \ + self.loss_weight['dfl'] * loss_dfl + \ + self.loss_weight['seg'] * loss_seg + + + return loss, \ + torch.cat(((self.loss_weight['iou'] * loss_iou).unsqueeze(0), + (self.loss_weight['dfl'] * loss_dfl).unsqueeze(0), + (self.loss_weight['class'] * loss_cls).unsqueeze(0), + (self.loss_weight['seg'] * loss_seg).unsqueeze(0))).detach() + + def preprocess(self, targets, batch_size, scale_tensor, segmask): + targets_list = np.zeros((batch_size, 1, 5)).tolist() + cu = [] + already = [] + for i, item in enumerate(targets.cpu().numpy().tolist()): + index = int(item[0]) + targets_list[index].append(item[1:]) + if index not in already: + already.append(index) + cu.append(i) + cu.append(segmask.shape[0]) + max_len = max((len(l) for l in targets_list)) + segmasks = torch.zeros(batch_size, max_len - 1, segmask.shape[-2], segmask.shape[-1]).cuda() + if len(already) != 0: + for i in range(len(already)): + j = already[i] + start = cu[i] + end = cu[i+1] + segmasks[j, : end - start] = segmask[start: end].clone() + targets = torch.from_numpy(np.array(list(map(lambda l:l + [[-1,0,0,0,0]]*(max_len - len(l)), targets_list)))[:,1:,:]).to(targets.device) + + batch_target = targets[:, :, 1:5].mul_(scale_tensor) + targets[..., 1:] = xywh2xyxy(batch_target) + return targets, segmasks + + def bbox_decode(self, anchor_points, pred_dist): + if self.use_dfl: + batch_size, n_anchors, _ = pred_dist.shape + pred_dist = F.softmax(pred_dist.view(batch_size, n_anchors, 4, self.reg_max + 1), dim=-1).matmul(self.proj.to(pred_dist.device)) + return dist2bbox(pred_dist, anchor_points) + + def mask_loss(self, gt_segmasks, seg_cf, seg_proto, txyxy_ori_s, fg_mask, idx_lst, target_scores=None, target_scores_sum=None, epoch=0): + # pred_mask_lst -> list + ''' + pred_mask -> Shape(n1, w, h) + gt_mask -> Shape(n, img_w, img_h) + xyxy -> Shape(n, 4) + sum(n1, n2, n3, ...) = n + torch.abs((xyxy[..., 3] - xyxy[..., 1]) * (xyxy[..., 4] - xyxy[..., 2])) -> area + fg_mask --> (bs, tsize) + idx -> (bs, tsize) + gt_segmasks -> (bs, labelsize, w, h) + ''' + sl = 0 + sl2 = 0 + bl = [2, 4, 8] + num_pos = fg_mask.sum() + tloss = torch.zeros(1).float().cuda() + if num_pos<=0: + for ipred in seg_proto: + tloss += (ipred.sum() * 0.) + for ipred in seg_cf: + tloss += (ipred.sum() * 0.) + return tloss[0] + + + xyxy_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + mtarget_scores = target_scores.sum(-1) # (bs, nl, 1) + + sl = 0 + qf = len(idx_lst) == 1 and len(idx_lst[0].shape) == 2 + if qf: + idx_lst = idx_lst[0] + _ = [_i.shape[1] for _i in seg_cf] + sp = [2, 4, 8] + fpn = [] + for i in range(0, 3): + fpn.extend([sp[i]] * _[i]) + fpn = torch.Tensor(fpn).unsqueeze(-1).cuda() + txyxy_ori = txyxy_ori_s * fpn.unsqueeze(0).repeat(seg_cf[0].shape[0], 1, 1) + iseg_cf = torch.cat(seg_cf, axis = 1) + iseg_proto = seg_proto[0] # (bs, 32, h, w) + bs = iseg_proto.shape[0] + if fg_mask.sum()<=0: + tloss += (iseg_proto.sum() * 0.) + tloss += (iseg_cf.sum() * 0.) 
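+            # no foreground anchors in this batch: return the zero-valued loss early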
+ return tloss[0] + + pshape = iseg_proto.shape[-1] + ishape = iseg_cf.shape[1] # (1) = part_n + idx = idx_lst[:, :] # (bs, part_n) + + ifg_mask = fg_mask[:, :] # (n) --> (bs, part_n) + itarget_scores = mtarget_scores[:, :] + target_sg = [] + pred_sg = [] + ixyxy_lst = [] + mask_weight = [] + for i in range(bs): + siproto = self.handle_proto_coord(iseg_proto[i]) + iproto = siproto.reshape(1, -1, *siproto.shape[-2:]) + idx_thisbatch = torch.masked_select(idx[i], ifg_mask[i]) #(casize) + igt_segmasks = gt_segmasks.reshape(-1, *gt_segmasks.shape[-2:])[idx_thisbatch] # (?1, h?, w?) --> (?2, h?, w?) + imask_weight = torch.masked_select(itarget_scores[i], ifg_mask[i]).unsqueeze(-1) + tiseg_cf = torch.masked_select(iseg_cf[i], ifg_mask[i].unsqueeze(-1).repeat(1, self.tt)) # (?2, 32) + tiseg_cf = tiseg_cf.reshape(-1, self.tt) + num_inst = tiseg_cf.shape[0] + if num_inst == 0: + tloss[0] += (tiseg_cf.sum() * 0.) + continue + mask_weight.append(imask_weight) + target_sg.append(igt_segmasks) + weights, biases = self.parse_dynamic_params(tiseg_cf) + n_layers = len(weights) + for _i, (weight, bias) in enumerate(zip(weights, biases)): + x = F.conv2d( + iproto, weight, bias=bias, stride=1, padding=0, groups=1) + if _i < n_layers - 1: + x = F.relu(x) + x = x.reshape(num_inst, *iproto.shape[-2:]) + ixyxy = torch.masked_select(txyxy_ori[i, :], xyxy_mask[i, :, :]).reshape(-1, 4) # (n, 4) --> (part_n, 4) --> (?2, 4) + ixyxy_lst.append(ixyxy) + pred_sg.append(x) + bxyxy = torch.cat(ixyxy_lst, dim = 0) + bpred_seg = torch.cat(pred_sg, dim = 0) + bgt_seg = torch.cat(target_sg, dim = 0) + masks_weight = torch.cat(mask_weight, dim = 0).reshape(-1) + if tuple(bgt_seg.shape[-2:]) != (pshape, pshape): # downsample + bgt_seg = F.interpolate(bgt_seg[None], (pshape, pshape), mode='nearest')[0] + area = torch.abs((bxyxy[..., 2] - bxyxy[..., 0]) * (bxyxy[..., 3] - bxyxy[..., 1])) + area = area / (pshape) + area = area / (pshape) + + if not self.dice: + loss = F.binary_cross_entropy_with_logits(bpred_seg, bgt_seg, reduction='none') + loss = (self.crop_mask(loss, bxyxy).mean(dim=(1, 2)) / area) * masks_weight + loss = loss.sum() + tloss += loss + if target_scores_sum > 1: + tloss[0] = tloss[0] / target_scores_sum + return tloss[0] / len(seg_cf) + else: + bpred_seg = bpred_seg.sigmoid() + if epoch <= 160: + loss = dice_loss(bpred_seg, bgt_seg, masks_weight, reduction='mean', avg_factor=target_scores_sum if target_scores_sum > 1 else 1) + else: + loss = dice_loss(bpred_seg, bgt_seg, reduction='mean') + tloss += loss + return tloss[0] + + @staticmethod + def crop_mask(masks, boxes): + """ + "Crop" predicted masks by zeroing out everything not in the predicted bbox. + Vectorized by Chong (thanks Chong). 
+ + Args: + - masks should be a size [n, h, w] tensor of masks + - boxes should be a size [n, 4] tensor of bbox coords in relative point form + """ + + n, h, w = masks.shape + x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1) # x1 shape(1,1,n) + r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :] # rows shape(1,w,1) + c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None] # cols shape(h,1,1) + + return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2)) + + +class VarifocalLoss(nn.Module): + def __init__(self): + super(VarifocalLoss, self).__init__() + + def forward(self, pred_score,gt_score, label, alpha=0.75, gamma=2.0): + + weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label + with torch.cuda.amp.autocast(enabled=False): + loss = (F.binary_cross_entropy(pred_score.float(), gt_score.float(), reduction='none') * weight).sum() + + return loss + + +class BboxLoss(nn.Module): + def __init__(self, num_classes, reg_max, use_dfl=False, iou_type='giou'): + super(BboxLoss, self).__init__() + self.num_classes = num_classes + self.iou_loss = IOUloss(box_format='xyxy', iou_type=iou_type, eps=1e-10) + self.reg_max = reg_max + self.use_dfl = use_dfl + + def forward(self, pred_dist, pred_bboxes, anchor_points, + target_bboxes, target_scores, target_scores_sum, fg_mask): + + # select positive samples mask + num_pos = fg_mask.sum() + if num_pos > 0: + # iou loss + bbox_mask = fg_mask.unsqueeze(-1).repeat([1, 1, 4]) + pred_bboxes_pos = torch.masked_select(pred_bboxes, + bbox_mask).reshape([-1, 4]) + target_bboxes_pos = torch.masked_select( + target_bboxes, bbox_mask).reshape([-1, 4]) + bbox_weight = torch.masked_select( + target_scores.sum(-1), fg_mask).unsqueeze(-1) + loss_iou = self.iou_loss(pred_bboxes_pos, + target_bboxes_pos) * bbox_weight + if target_scores_sum > 1: + loss_iou = loss_iou.sum() / target_scores_sum + else: + loss_iou = loss_iou.sum() + + # dfl loss + if self.use_dfl: + dist_mask = fg_mask.unsqueeze(-1).repeat( + [1, 1, (self.reg_max + 1) * 4]) + pred_dist_pos = torch.masked_select( + pred_dist, dist_mask).reshape([-1, 4, self.reg_max + 1]) + target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max) + target_ltrb_pos = torch.masked_select( + target_ltrb, bbox_mask).reshape([-1, 4]) + loss_dfl = self._df_loss(pred_dist_pos, + target_ltrb_pos) * bbox_weight + if target_scores_sum > 1: + loss_dfl = loss_dfl.sum() / target_scores_sum + else: + loss_dfl = loss_dfl.sum() + else: + loss_dfl = pred_dist.sum() * 0. + + else: + loss_iou = pred_dist.sum() * 0. + loss_dfl = pred_dist.sum() * 0. + + return loss_iou, loss_dfl + + def _df_loss(self, pred_dist, target): + target_left = target.to(torch.long) + target_right = target_left + 1 + weight_left = target_right.to(torch.float) - target + weight_right = 1 - weight_left + loss_left = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_left.view(-1), reduction='none').view( + target_left.shape) * weight_left + loss_right = F.cross_entropy( + pred_dist.view(-1, self.reg_max + 1), target_right.view(-1), reduction='none').view( + target_left.shape) * weight_right + return (loss_left + loss_right).mean(-1, keepdim=True) + +def dice_loss(pred, + target, + weight=None, + eps=1e-3, + reduction='mean', + naive_dice=False, + avg_factor=None): + """Calculate dice loss, there are two forms of dice loss is supported: + Borrowed from MMDetection + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. 
+ - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + +def weight_reduce_loss(loss, + weight=None, + reduction='none', + avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Optional[Tensor], optional): Element-wise weights. + Defaults to None. + reduction (str, optional): Same as built-in losses of PyTorch. + Defaults to 'mean'. + avg_factor (Optional[float], optional): Average factor when + computing the mean of losses. Defaults to None. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. 
+ """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() \ No newline at end of file diff --git a/yolov6/models/reppan.py b/yolov6/models/reppan.py index 2114f521..820f4211 100644 --- a/yolov6/models/reppan.py +++ b/yolov6/models/reppan.py @@ -551,22 +551,14 @@ def __init__( channels_list=None, num_repeats=None, block=BottleRep, - csp_e=float(1)/2, - stage_block_type="BepC3" + csp_e=float(1)/2 ): super().__init__() - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - assert channels_list is not None assert num_repeats is not None - self.Rep_p4 = stage_block( + self.Rep_p4 = BepC3( in_channels=channels_list[3] + channels_list[5], # 512 + 256 out_channels=channels_list[5], # 256 n=num_repeats[5], @@ -574,7 +566,7 @@ def __init__( block=block ) - self.Rep_p3 = stage_block( + self.Rep_p3 = BepC3( in_channels=channels_list[2] + channels_list[6], # 256 + 128 out_channels=channels_list[6], # 128 n=num_repeats[6], @@ -582,7 +574,7 @@ def __init__( block=block ) - self.Rep_n3 = stage_block( + self.Rep_n3 = BepC3( in_channels=channels_list[6] + channels_list[7], # 128 + 128 out_channels=channels_list[8], # 256 n=num_repeats[7], @@ -590,7 +582,7 @@ def __init__( block=block ) - self.Rep_n4 = stage_block( + self.Rep_n4 = BepC3( in_channels=channels_list[5] + channels_list[9], # 256 + 256 out_channels=channels_list[10], # 512 n=num_repeats[8], @@ -795,21 +787,13 @@ def __init__( channels_list=None, num_repeats=None, block=BottleRep, - csp_e=float(1)/2, - stage_block_type="BepC3" + csp_e=float(1)/2 ): super().__init__() assert channels_list is not None assert num_repeats is not None - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - self.reduce_layer0 = ConvBNReLU( in_channels=channels_list[5], # 1024 out_channels=channels_list[6], # 512 @@ -822,7 +806,7 @@ def __init__( out_channels=channels_list[6], # 512 ) - self.Rep_p5 = stage_block( + self.Rep_p5 = BepC3( in_channels=channels_list[4] + channels_list[6], # 768 + 512 out_channels=channels_list[6], # 512 n=num_repeats[6], @@ -842,7 +826,7 @@ def __init__( out_channels=channels_list[7] # 256 ) - self.Rep_p4 = stage_block( + self.Rep_p4 = BepC3( in_channels=channels_list[3] + channels_list[7], # 512 + 256 out_channels=channels_list[7], # 256 n=num_repeats[7], @@ -862,7 +846,7 @@ def __init__( out_channels=channels_list[8] # 128 ) - self.Rep_p3 = stage_block( + self.Rep_p3 = BepC3( in_channels=channels_list[2] + channels_list[8], # 256 + 128 out_channels=channels_list[8], # 128 n=num_repeats[8], @@ -877,7 +861,7 @@ def __init__( stride=2 ) - self.Rep_n4 = stage_block( + self.Rep_n4 = BepC3( in_channels=channels_list[8] + channels_list[8], # 128 + 128 out_channels=channels_list[9], # 256 n=num_repeats[9], @@ -892,7 +876,7 @@ def __init__( stride=2 ) - self.Rep_n5 = stage_block( + self.Rep_n5 = BepC3( in_channels=channels_list[7] + channels_list[9], # 256 + 256 out_channels=channels_list[10], # 512 n=num_repeats[10], @@ -907,7 +891,7 @@ def __init__( stride=2 ) - self.Rep_n6 = stage_block( + self.Rep_n6 = BepC3( in_channels=channels_list[6] + channels_list[10], # 512 + 512 out_channels=channels_list[11], # 1024 n=num_repeats[11], @@ -962,21 +946,13 @@ def __init__( 
channels_list=None, num_repeats=None, block=BottleRep, - csp_e=float(1)/2, - stage_block_type="BepC3" + csp_e=float(1)/2 ): super().__init__() assert channels_list is not None assert num_repeats is not None - if stage_block_type == "BepC3": - stage_block = BepC3 - elif stage_block_type == "MBLABlock": - stage_block = MBLABlock - else: - raise NotImplementedError - self.reduce_layer0 = ConvBNReLU( in_channels=channels_list[5], # 1024 out_channels=channels_list[6], # 512 @@ -989,7 +965,7 @@ def __init__( out_channels=channels_list[6], # 512 ) - self.Rep_p5 = stage_block( + self.Rep_p5 = BepC3( in_channels=channels_list[6], # 512 out_channels=channels_list[6], # 512 n=num_repeats[6], @@ -1009,7 +985,7 @@ def __init__( out_channels=channels_list[7], # 256 ) - self.Rep_p4 = stage_block( + self.Rep_p4 = BepC3( in_channels=channels_list[7], # 256 out_channels=channels_list[7], # 256 n=num_repeats[7], @@ -1029,7 +1005,7 @@ def __init__( out_channels=channels_list[8], # 128 ) - self.Rep_p3 = stage_block( + self.Rep_p3 = BepC3( in_channels=channels_list[8], # 128 out_channels=channels_list[8], # 128 n=num_repeats[8], @@ -1044,7 +1020,7 @@ def __init__( stride=2 ) - self.Rep_n4 = stage_block( + self.Rep_n4 = BepC3( in_channels=channels_list[8] + channels_list[8], # 128 + 128 out_channels=channels_list[9], # 256 n=num_repeats[9], @@ -1059,7 +1035,7 @@ def __init__( stride=2 ) - self.Rep_n5 = stage_block( + self.Rep_n5 = BepC3( in_channels=channels_list[7] + channels_list[9], # 256 + 256 out_channels=channels_list[10], # 512 n=num_repeats[10], @@ -1074,7 +1050,7 @@ def __init__( stride=2 ) - self.Rep_n6 = stage_block( + self.Rep_n6 = BepC3( in_channels=channels_list[6] + channels_list[10], # 512 + 512 out_channels=channels_list[11], # 1024 n=num_repeats[11], diff --git a/yolov6/models/yolo.py b/yolov6/models/yolo.py index 2f37f1b1..5e121b79 100644 --- a/yolov6/models/yolo.py +++ b/yolov6/models/yolo.py @@ -63,6 +63,11 @@ def build_network(config, channels, num_classes, num_layers, fuse_ab=False, dist channels_list_neck = config.model.neck.out_channels use_dfl = config.model.head.use_dfl reg_max = config.model.head.reg_max + issolo = config.model.head.issolo + isseg = config.model.head.isseg + npr = config.model.head.npr + npr = make_divisible(npr * width_mul, 8) + nm = config.model.head.nm num_repeat = [(max(round(i * depth_mul), 1) if i > 1 else i) for i in (num_repeat_backbone + num_repeat_neck)] channels_list = [make_divisible(i * width_mul, 8) for i in (channels_list_backbone + channels_list_neck)] @@ -110,8 +115,20 @@ def build_network(config, channels, num_classes, num_layers, fuse_ab=False, dist num_repeats=num_repeat, block=block ) - - if distill_ns: + if isseg: + if issolo: + from yolov6.models.heads.effidehead_fuseab_seg_solo import Detect, build_effidehead_layer, Proto + anchors_init = config.model.head.anchors_init + head_layers = build_effidehead_layer(channels_list, 3, num_classes, reg_max=reg_max, num_layers=num_layers, num_masks=nm + 2 + 1, fuse_ab=fuse_ab) + reg_masks = [Proto(num_layers, channels_list, 0, npr, nm, scale_factor=2), Proto(num_layers, channels_list, 1, npr, nm, scale_factor=4), Proto(num_layers, channels_list, 2, npr, nm, scale_factor=8)] + head = Detect(num_classes, anchors_init, num_layers, head_layers=head_layers, use_dfl=use_dfl, reg_mask=reg_masks, fuse_ab=fuse_ab, nm=nm + 2 + 1) + else: + from yolov6.models.heads.effidehead_fuseab_seg import Detect, build_effidehead_layer, Proto + anchors_init = config.model.head.anchors_init + head_layers = 
build_effidehead_layer(channels_list, 3, num_classes, reg_max=reg_max, num_layers=num_layers, num_masks=nm, fuse_ab=fuse_ab) + reg_masks = [Proto(num_layers, channels_list, 0, npr, nm)] + head = Detect(num_classes, anchors_init, num_layers, head_layers=head_layers, use_dfl=use_dfl, reg_mask=reg_masks, fuse_ab=fuse_ab) + elif distill_ns: from yolov6.models.heads.effidehead_distill_ns import Detect, build_effidehead_layer if num_layers != 3: LOGGER.error('ERROR in: Distill mode not fit on n/s models with P6 head.\n') diff --git a/yolov6/utils/general.py b/yolov6/utils/general.py index cb4418cd..e144f95d 100644 --- a/yolov6/utils/general.py +++ b/yolov6/utils/general.py @@ -5,7 +5,6 @@ import math import torch import requests -import pkg_resources as pkg from pathlib import Path from yolov6.utils.events import LOGGER @@ -94,7 +93,6 @@ def download_ckpt(path): LOGGER.info(f"checkpoint {basename} not exist, try to downloaded it from github.") # need to update the link with every release url = f"https://github.com/meituan/YOLOv6/releases/download/0.4.0/{basename}" - LOGGER.warning(f"downloading url is: {url}, pealse make sure the version of the downloading model is correspoing to the code version!") r = requests.get(url, allow_redirects=True) assert r.status_code == 200, "Unable to download checkpoints, manually download it" open(path, 'wb').write(r.content) @@ -115,13 +113,3 @@ def check_img_size(imgsz, s=32, floor=0): if new_size != imgsz: LOGGER.warning(f'--img-size {imgsz} must be multiple of max stride {s}, updating to {new_size}') return new_size - - -def check_version(current='0.0.0', minimum='0.0.0', name='version ', pinned=False, hard=False, verbose=False): - # Check whether the package's version is match the required version. - current, minimum = (pkg.parse_version(x) for x in (current, minimum)) - result = (current == minimum) if pinned else (current >= minimum) # bool - if hard: - info = f'⚠️ {name}{minimum} is required by YOLOv6, but {name}{current} is currently installed' - assert result, info # assert minimum version requirement - return result diff --git a/yolov6/utils/metrics.py b/yolov6/utils/metrics.py index cbfa130e..c54b4f8a 100644 --- a/yolov6/utils/metrics.py +++ b/yolov6/utils/metrics.py @@ -9,8 +9,9 @@ import torch import warnings from . import general +import torch.nn.functional as F -def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=()): +def ap_per_class_v6(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), prefix = ''): """ Compute the average precision, given the recall and precision curves. Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. 
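The new branch in `build_network` is driven by several head fields that must now exist in the config (`isseg`, `issolo`, `nm`, `npr`, `anchors_init`). Below is an illustrative config fragment; only the field names come from the code above, the concrete numbers are placeholders rather than values taken from this patch.

```python
# Hypothetical head section of a segmentation config; key names follow
# build_network() above, values are illustrative only.
head = dict(
    type='EffiDeHead',
    isseg=True,       # enable the instance-segmentation head path
    issolo=False,     # False -> effidehead_fuseab_seg, True -> effidehead_fuseab_seg_solo
    nm=32,            # number of mask coefficients per detection (placeholder)
    npr=256,          # prototype channels; scaled by width_mul and rounded to a multiple of 8
    use_dfl=True,
    reg_max=16,
    anchors_init=[[10, 13, 19, 19, 33, 23],
                  [30, 61, 59, 59, 59, 119],
                  [116, 90, 185, 185, 373, 326]],
)
```

In the SOLO branch the head layers are built with `num_masks=nm + 2 + 1` and three `Proto` modules at scale factors 2, 4 and 8, whereas the plain segmentation branch uses a single `Proto` and `num_masks=nm`.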
# Arguments @@ -57,7 +58,7 @@ def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names # AP from recall-precision curve for j in range(tp.shape[1]): - ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + ap[ci, j], mpre, mrec = compute_ap_v6(recall[:, j], precision[:, j]) if plot and j == 0: py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 @@ -71,8 +72,112 @@ def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names # i = f1.mean(0).argmax() # max F1 index # return p[:, i], r[:, i], ap, f1[:, i], unique_classes.astype('int32') - return p, r, ap, f1, unique_classes.astype('int32') + AP50_F1_max_idx = len(f1.mean(0)) - f1.mean(0)[::-1].argmax() -1 + ap50, ap = ap[:, 0], ap.mean(1) + mp, mr, map50, map = p[:, AP50_F1_max_idx].mean(), r[:, AP50_F1_max_idx].mean(), ap50.mean(), ap.mean() + return mp, mr, map50, map, AP50_F1_max_idx +def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='.', names=(), eps=1e-16, prefix=''): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Objectness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). + target_cls: True object classes (nparray). + plot: Plot precision-recall curve at mAP@0.5 + save_dir: Plot save directory + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes, nt = np.unique(target_cls, return_counts=True) + nc = unique_classes.shape[0] # number of classes, number of detections + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + ap, p, r = np.zeros((nc, tp.shape[1])), np.zeros((nc, 1000)), np.zeros((nc, 1000)) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = nt[ci] # number of labels + n_p = i.sum() # number of predictions + if n_p == 0 or n_l == 0: + continue + + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + eps) # recall curve + r[ci] = np.interp(-px, -conf[i], recall[:, 0], left=0) # negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-px, -conf[i], precision[:, 0], left=1) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + if plot and j == 0: + py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 + + # Compute F1 (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + eps) + plot = False + if plot: + names = [v for k, v in names.items() if k in unique_classes] # list: only classes that have data + names = dict(enumerate(names)) # to dict + plot_pr_curve(px, py, ap, Path(save_dir) / f'{prefix}PR_curve.png', names) + plot_mc_curve(px, f1, Path(save_dir) / f'{prefix}F1_curve.png', names, ylabel='F1') + plot_mc_curve(px, p, Path(save_dir) / f'{prefix}P_curve.png', names, ylabel='Precision') + plot_mc_curve(px, r, Path(save_dir) / f'{prefix}R_curve.png', names, ylabel='Recall') + + i = smooth(f1.mean(0), 0.1).argmax() # max F1 index + p, r, f1 = p[:, i], r[:, i], f1[:, i] + tp = (r * nt).round() # true positives + fp = (tp / (p + eps) - tp).round() # false positives + 
return tp, fp, p, r, f1, ap, unique_classes.astype(int) + +def smooth(y, f=0.05): + # Box filter of fraction f + nf = round(len(y) * f * 2) // 2 + 1 # number of filter elements (must be odd) + p = np.ones(nf // 2) # ones padding + yp = np.concatenate((p * y[0], y, p * y[-1]), 0) # y padded + return np.convolve(yp, np.ones(nf) / nf, mode='valid') # y-smoothed + + +def compute_ap_v6(recall, precision): + """ Compute the average precision, given the recall and precision curves + # Arguments + recall: The recall curve (list) + precision: The precision curve (list) + # Returns + Average precision, precision curve, recall curve + """ + + # Append sentinel values to beginning and end + mrec = np.concatenate(([0.0], recall, [1.0])) + mpre = np.concatenate(([1.0], precision, [0.0])) + + # Compute the precision envelope + mpre = np.flip(np.maximum.accumulate(np.flip(mpre))) + + # Integrate area under curve + method = 'interp' # methods: 'continuous', 'interp' + if method == 'interp': + x = np.linspace(0, 1, 101) # 101-point interp (COCO) + ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate + else: # 'continuous' + i = np.where(mrec[1:] != mrec[:-1])[0] # points where x axis (recall) changes + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve + + return ap, mpre, mrec def compute_ap(recall, precision): """ Compute the average precision, given the recall and precision curves @@ -101,7 +206,6 @@ def compute_ap(recall, precision): return ap, mpre, mrec -# Plots ---------------------------------------------------------------------------------------------------------------- def plot_pr_curve(px, py, ap, save_dir='pr_curve.png', names=()): # Precision-recall curve @@ -142,17 +246,54 @@ def plot_mc_curve(px, py, save_dir='mc_curve.png', names=(), xlabel='Confidence' plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") fig.savefig(Path(save_dir), dpi=250) -def process_batch(detections, labels, iouv): +# def process_batch(detections, labels, iouv): +# """ +# Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format. +# Arguments: +# detections (Array[N, 6]), x1, y1, x2, y2, conf, class +# labels (Array[M, 5]), class, x1, y1, x2, y2 +# Returns: +# correct (Array[N, 10]), for 10 IoU levels +# """ +# correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) +# iou = general.box_iou(labels[:, 1:], detections[:, :4]) +# correct_class = labels[:, 0:1] == detections[:, 5] +# for i in range(len(iouv)): +# x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match +# if x[0].shape[0]: +# matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] +# if x[0].shape[0] > 1: +# matches = matches[matches[:, 2].argsort()[::-1]] +# matches = matches[np.unique(matches[:, 1], return_index=True)[1]] +# # matches = matches[matches[:, 2].argsort()[::-1]] +# matches = matches[np.unique(matches[:, 0], return_index=True)[1]] +# correct[matches[:, 1].astype(int), i] = True +# return torch.tensor(correct, dtype=torch.bool, device=iouv.device) + +def process_batch(detections, labels, iouv, pred_masks=None, gt_masks=None, overlap=False, masks=False): """ - Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format. 
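`compute_ap_v6` integrates the precision envelope over 101 interpolated recall points (the COCO convention); the toy values below are invented, but the steps mirror the function above.

```python
# Toy walk-through of the 101-point interpolated AP used by compute_ap_v6.
import numpy as np

recall    = np.array([0.1, 0.4, 0.6, 0.8])   # made-up recall curve
precision = np.array([1.0, 0.9, 0.7, 0.5])   # matching precision values

mrec = np.concatenate(([0.0], recall, [1.0]))          # sentinel values
mpre = np.concatenate(([1.0], precision, [0.0]))
mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))   # precision envelope

x = np.linspace(0, 1, 101)                             # 101 recall points
ap = np.trapz(np.interp(x, mrec, mpre), x)
print(f"AP ~= {ap:.3f}")
```

`ap_per_class_v6` then reports precision and recall at the confidence index where the mean F1 curve peaks, while the yolov5-style `ap_per_class` smooths F1 and returns per-class vectors for the box/mask bookkeeping below.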
+ Return correct prediction matrix Arguments: - detections (Array[N, 6]), x1, y1, x2, y2, conf, class - labels (Array[M, 5]), class, x1, y1, x2, y2 + detections (array[N, 6]), x1, y1, x2, y2, conf, class + labels (array[M, 5]), class, x1, y1, x2, y2 Returns: - correct (Array[N, 10]), for 10 IoU levels + correct (array[N, 10]), for 10 IoU levels """ + if masks: + gt_masks = gt_masks.to(pred_masks.device) + if overlap: + nl = len(labels) + index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1 + gt_masks = gt_masks.repeat(nl, 1, 1) # shape(1,640,640) -> (n,640,640) + gt_masks = torch.where(gt_masks == index, 1.0, 0.0) + if gt_masks.shape[1:] != pred_masks.shape[1:]: + gt_masks = F.interpolate(gt_masks[None].to(torch.float32), pred_masks.shape[1:], mode='bilinear', align_corners=False)[0] + gt_masks = gt_masks.gt_(0.5) + iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1).float(), pred_masks.view(pred_masks.shape[0], -1)).to(iouv.device) + else: # boxes + iou = box_iou(labels[:, 1:], detections[:, :4]).to(iouv.device) + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) - iou = general.box_iou(labels[:, 1:], detections[:, :4]) correct_class = labels[:, 0:1] == detections[:, 5] for i in range(len(iouv)): x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match @@ -256,3 +397,232 @@ def plot(self, normalize=True, save_dir='', names=()): def print(self): for i in range(self.nc + 1): print(' '.join(map(str, self.matrix[i]))) + + +def ap_per_class_box_and_mask( + tp_m, + tp_b, + conf, + pred_cls, + target_cls, + plot=False, + save_dir='.', + names=(), + is_v6=False +): + """ + Args: + tp_b: tp of boxes. + tp_m: tp of masks. + other arguments see `func: ap_per_class`. + #return p, r, ap, f1, unique_classes.astype('int32') + """ + if not is_v6: + results_boxes = ap_per_class(tp_b, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Box')[2:] + results_masks = ap_per_class(tp_m, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Mask')[2:] + + results = { + 'boxes': { + 'p': results_boxes[0], + 'r': results_boxes[1], + 'ap': results_boxes[3], + 'f1': results_boxes[2], + 'ap_class': results_boxes[4]}, + 'masks': { + 'p': results_masks[0], + 'r': results_masks[1], + 'ap': results_masks[3], + 'f1': results_masks[2], + 'ap_class': results_masks[4]}} + return results + else: + results_boxes = ap_per_class_v6(tp_b, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Box') + results_masks = ap_per_class(tp_m, + conf, + pred_cls, + target_cls, + plot=plot, + save_dir=save_dir, + names=names, + prefix='Mask') + return results_boxes, results_masks + +class Metric: + + def __init__(self) -> None: + self.p = [] # (nc, ) + self.r = [] # (nc, ) + self.f1 = [] # (nc, ) + self.all_ap = [] # (nc, 10) + self.ap_class_index = [] # (nc, ) + + @property + def ap50(self): + """AP@0.5 of all classes. + Return: + (nc, ) or []. + """ + return self.all_ap[:, 0] if len(self.all_ap) else [] + + @property + def ap(self): + """AP@0.5:0.95 + Return: + (nc, ) or []. + """ + return self.all_ap.mean(1) if len(self.all_ap) else [] + + @property + def mp(self): + """mean precision of all classes. + Return: + float. + """ + return self.p.mean() if len(self.p) else 0.0 + + @property + def mr(self): + """mean recall of all classes. + Return: + float. 
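The masks branch of the new `process_batch` expands a single "overlap" ground-truth mask, where pixel value `i + 1` marks instance `i`, into per-instance binary masks before computing mask IoU. A small sketch of just that expansion, with an invented 6x6 mask:

```python
# Sketch of the overlap-mask expansion used in process_batch(..., overlap=True).
import torch

nl = 3                                       # number of GT instances in the image
gt = torch.zeros(1, 6, 6)                    # one mask; pixel value i+1 marks instance i
gt[0, 0:2, 0:2] = 1
gt[0, 2:4, 2:4] = 2
gt[0, 4:6, 4:6] = 3

index = torch.arange(nl).view(nl, 1, 1) + 1              # instance ids, shape (nl, 1, 1)
per_instance = torch.where(gt.repeat(nl, 1, 1) == index, 1.0, 0.0)
print(per_instance.shape, per_instance.sum(dim=(1, 2)))  # (3, 6, 6), 4 pixels per instance
```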
+ """ + return self.r.mean() if len(self.r) else 0.0 + + @property + def map50(self): + """Mean AP@0.5 of all classes. + Return: + float. + """ + return self.all_ap[:, 0].mean() if len(self.all_ap) else 0.0 + + @property + def map(self): + """Mean AP@0.5:0.95 of all classes. + Return: + float. + """ + return self.all_ap.mean() if len(self.all_ap) else 0.0 + + def mean_results(self): + """Mean of results, return mp, mr, map50, map""" + return (self.mp, self.mr, self.map50, self.map) + + def class_result(self, i): + """class-aware result, return p[i], r[i], ap50[i], ap[i]""" + return (self.p[i], self.r[i], self.ap50[i], self.ap[i]) + + def get_maps(self, nc): + maps = np.zeros(nc) + self.map + for i, c in enumerate(self.ap_class_index): + maps[c] = self.ap[i] + return maps + + def update(self, results): + """ + Args: + results: tuple(p, r, ap, f1, ap_class) + """ + p, r, all_ap, f1, ap_class_index = results + self.p = p + self.r = r + self.all_ap = all_ap + self.f1 = f1 + self.ap_class_index = ap_class_index + + +class Metrics: + """Metric for boxes and masks.""" + + def __init__(self) -> None: + self.metric_box = Metric() + self.metric_mask = Metric() + + def update(self, results): + """ + Args: + results: Dict{'boxes': Dict{}, 'masks': Dict{}} + """ + self.metric_box.update(list(results['boxes'].values())) + self.metric_mask.update(list(results['masks'].values())) + + def mean_results(self): + return self.metric_box.mean_results() + self.metric_mask.mean_results() + + def class_result(self, i): + return self.metric_box.class_result(i) + self.metric_mask.class_result(i) + + def get_maps(self, nc): + return self.metric_box.get_maps(nc) + self.metric_mask.get_maps(nc) + + @property + def ap_class_index(self): + # boxes and masks have the same ap_class_index + return self.metric_box.ap_class_index + +def mask_iou(mask1, mask2, eps=1e-7): + """ + mask1: [N, n] m1 means number of predicted objects + mask2: [M, n] m2 means number of gt objects + Note: n means image_w x image_h + + return: masks iou, [N, M] + """ + mask1 = mask1.float() + intersection = torch.matmul(mask1, mask2.t()).clamp(0) + union = (mask1.sum(1)[:, None] + mask2.sum(1)[None]) - intersection # (area1 + area2) - intersection + return intersection / (union + eps) + + +def masks_iou(mask1, mask2, eps=1e-7): + """ + mask1: [N, n] m1 means number of predicted objects + mask2: [N, n] m2 means number of gt objects + Note: n means image_w x image_h + + return: masks iou, (N, ) + """ + intersection = (mask1 * mask2).sum(1).clamp(0) # (N, ) + union = (mask1.sum(1) + mask2.sum(1))[None] - intersection # (area1 + area2) - intersection + return intersection / (union + eps) + +def box_iou(box1, box2, eps=1e-7): + # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py + """ + Return intersection-over-union (Jaccard index) of boxes. + Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
+ Arguments: + box1 (Tensor[N, 4]) + box2 (Tensor[M, 4]) + Returns: + iou (Tensor[N, M]): the NxM matrix containing the pairwise + IoU values for every element in boxes1 and boxes2 + """ + + # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) + (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2) + inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2) + + # IoU = inter / (area1 + area2 - inter) + return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps) \ No newline at end of file diff --git a/yolov6/utils/nms.py b/yolov6/utils/nms.py index 0f812642..c7369ba0 100644 --- a/yolov6/utils/nms.py +++ b/yolov6/utils/nms.py @@ -103,3 +103,164 @@ def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=Non break # time limit exceeded return output + + +def non_max_suppression_seg(predictions, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, max_det=300): + """Runs Non-Maximum Suppression (NMS) on inference results. + This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775 + Args: + prediction: (tensor), with shape [N, 5 + num_classes], N is the number of bboxes. + conf_thres: (float) confidence threshold. + iou_thres: (float) iou threshold. + classes: (None or list[int]), if a list is provided, nms only keep the classes you provide. + agnostic: (bool), when it is set to True, we do class-independent nms, otherwise, different class would do nms respectively. + multi_label: (bool), when it is set to True, one box can have multi labels, otherwise, one box only huave one label. + max_det:(int), max number of output bboxes. + + Returns: + list of detections, echo item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. + """ + prediction = predictions[0] + confs = predictions[2] # (bs, which_proto, fs) + prediction = torch.cat([prediction, confs], axis=2)# (bs, l ,5 + num_classes + 33) + + num_classes = prediction.shape[2] - 5 - 33 # number of classes + pred_candidates = torch.logical_and(prediction[..., 4] > conf_thres, torch.max(prediction[..., 5: 5 + num_classes], axis=-1)[0] > conf_thres) # candidates + # Check the parameters. + assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.' + assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.' + + # Function settings. + max_wh = 4096 # maximum box width and height + max_nms = 30000 # maximum number of boxes put into torchvision.ops.nms() + time_limit = 10.0 # quit the function when nms cost time exceed the limit time. + multi_label &= num_classes > 1 # multiple labels per box + + tik = time.time() + output = [torch.zeros((0, 6 + 33), device=prediction.device)] * prediction.shape[0] + for img_idx, x in enumerate(prediction): # image index, image inference + x = x[pred_candidates[img_idx]] # confidence + + # If no box remains, skip the next process. 
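The pairwise mask IoU above works on flattened binary masks with a single matrix multiplication. A self-contained check that mirrors the helper, using toy 4x4 masks:

```python
# Local mirror of mask_iou() from the patch, checked on two tiny masks.
import torch

def mask_iou(mask1, mask2, eps=1e-7):
    # mask1: (N, n) predicted masks, mask2: (M, n) GT masks, n = H*W
    mask1 = mask1.float()
    inter = torch.matmul(mask1, mask2.t()).clamp(0)
    union = mask1.sum(1)[:, None] + mask2.sum(1)[None] - inter
    return inter / (union + eps)

pred = torch.zeros(2, 4, 4); pred[0, :2, :2] = 1; pred[1, 2:, 2:] = 1
gt   = torch.zeros(1, 4, 4); gt[0, :2, :2] = 1
print(mask_iou(pred.view(2, -1), gt.view(1, -1)))   # ~[[1.0], [0.0]]
```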
+ if not x.shape[0]: + continue + + # confidence multiply the objectness + x[:, 5: 5 + num_classes] *= x[:, 4:5] # conf = obj_conf * cls_conf + + # (center x, center y, width, height) to (x1, y1, x2, y2) + box = xywh2xyxy(x[:, :4]) + segconf = x[:, 5 + num_classes: ] + + # Detections matrix's shape is (n,6), each row represents (xyxy, conf, cls) + if multi_label: + box_idx, class_idx = (x[:, 5: 5 + num_classes] > conf_thres).nonzero(as_tuple=False).T + x = torch.cat((box[box_idx], x[box_idx, class_idx + 5, None], class_idx[:, None].float(), segconf[box_idx]), 1) + else: # Only keep the class with highest scores. + conf, class_idx = x[:, 5: 5 + num_classes].max(1, keepdim=True) + x = torch.cat((box, conf, class_idx.float(), segconf), 1)[conf.view(-1) > conf_thres] + + # Filter by class, only keep boxes whose category is in classes. + if classes is not None: + x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] + + # Check shape + num_box = x.shape[0] # number of boxes + if not num_box: # no boxes kept. + continue + elif num_box > max_nms: # excess max boxes' number. + x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence + + # Batched NMS + class_offset = x[:, 5:6] * (0 if agnostic else max_wh) # classes + boxes, scores = x[:, :4] + class_offset, x[:, 4] # boxes (offset by class), scores + keep_box_idx = torchvision.ops.nms(boxes, scores, iou_thres) # NMS + if keep_box_idx.shape[0] > max_det: # limit detections + keep_box_idx = keep_box_idx[:max_det] + + output[img_idx] = x[keep_box_idx] + if (time.time() - tik) > time_limit: + print(f'WARNING: NMS cost time exceed the limited {time_limit}s.') + break # time limit exceeded + + return output + +def non_max_suppression_seg_solo(predictions, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False, max_det=300): + """Runs Non-Maximum Suppression (NMS) on inference results. + This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775 + Args: + prediction: (tensor), with shape [N, 5 + num_classes], N is the number of bboxes. + conf_thres: (float) confidence threshold. + iou_thres: (float) iou threshold. + classes: (None or list[int]), if a list is provided, nms only keep the classes you provide. + agnostic: (bool), when it is set to True, we do class-independent nms, otherwise, different class would do nms respectively. + multi_label: (bool), when it is set to True, one box can have multi labels, otherwise, one box only huave one label. + max_det:(int), max number of output bboxes. + + Returns: + list of detections, echo item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls]. + """ + prediction = predictions[0] + confs = predictions[2] # (bs, which_proto, fs) + prediction = torch.cat([prediction, confs], axis=2)# (bs, l ,5 + num_classes + 68) + + num_classes = prediction.shape[2] - 5 - 68 # number of classes + pred_candidates = torch.logical_and(prediction[..., 4] > conf_thres, torch.max(prediction[..., 5: 5 + num_classes], axis=-1)[0] > conf_thres) # candidates + # Check the parameters. + assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.' + assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.' + + # Function settings. 
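`non_max_suppression_seg` concatenates 33 extra mask-related channels onto every prediction and carries them through NMS, so each surviving row is `[x1, y1, x2, y2, conf, cls, <33 extra channels>]`; the exact meaning of those 33 channels is head-specific, so treat the layout below as my reading of the code rather than a documented contract.

```python
# Unpacking one image's detections from non_max_suppression_seg() output.
import torch

det = torch.zeros(5, 6 + 33)   # pretend 5 detections survived NMS (dummy values)
boxes, conf, cls, seg = det[:, :4], det[:, 4], det[:, 5], det[:, 6:]
print(boxes.shape, conf.shape, cls.shape, seg.shape)   # (5,4) (5,) (5,) (5,33)
```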
+ max_wh = 4096 # maximum box width and height + max_nms = 30000 # maximum number of boxes put into torchvision.ops.nms() + time_limit = 10.0 # quit the function when nms cost time exceed the limit time. + multi_label &= num_classes > 1 # multiple labels per box + + tik = time.time() + output = [torch.zeros((0, 6 + 68), device=prediction.device)] * prediction.shape[0] + for img_idx, x in enumerate(prediction): # image index, image inference + x = x[pred_candidates[img_idx]] # confidence + + # If no box remains, skip the next process. + if not x.shape[0]: + continue + + # confidence multiply the objectness + x[:, 5: 5 + num_classes] *= x[:, 4:5] # conf = obj_conf * cls_conf + + # (center x, center y, width, height) to (x1, y1, x2, y2) + box = xywh2xyxy(x[:, :4]) + segconf = x[:, 5 + num_classes: ] + + # Detections matrix's shape is (n,6), each row represents (xyxy, conf, cls) + if multi_label: + box_idx, class_idx = (x[:, 5: 5 + num_classes] > conf_thres).nonzero(as_tuple=False).T + x = torch.cat((box[box_idx], x[box_idx, class_idx + 5, None], class_idx[:, None].float(), segconf[box_idx]), 1) + else: # Only keep the class with highest scores. + conf, class_idx = x[:, 5: 5 + num_classes].max(1, keepdim=True) + x = torch.cat((box, conf, class_idx.float(), segconf), 1)[conf.view(-1) > conf_thres] + + # Filter by class, only keep boxes whose category is in classes. + if classes is not None: + x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] + + # Check shape + num_box = x.shape[0] # number of boxes + if not num_box: # no boxes kept. + continue + elif num_box > max_nms: # excess max boxes' number. + x = x[x[:, 4].argsort(descending=True)[:max_nms]] # sort by confidence + + # Batched NMS + class_offset = x[:, 5:6] * (0 if agnostic else max_wh) # classes + boxes, scores = x[:, :4] + class_offset, x[:, 4] # boxes (offset by class), scores + keep_box_idx = torchvision.ops.nms(boxes, scores, iou_thres) # NMS + if keep_box_idx.shape[0] > max_det: # limit detections + keep_box_idx = keep_box_idx[:max_det] + + output[img_idx] = x[keep_box_idx] + if (time.time() - tik) > time_limit: + print(f'WARNING: NMS cost time exceed the limited {time_limit}s.') + break # time limit exceeded + + return output diff --git a/yolov6/utils/test1.py b/yolov6/utils/test1.py new file mode 100644 index 00000000..246494f2 --- /dev/null +++ b/yolov6/utils/test1.py @@ -0,0 +1,23 @@ +def process_batch(detections, labels, iouv): + """ + Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format. 
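Both segmentation NMS variants reuse the standard class-offset trick: boxes are shifted by `class_id * max_wh` so one `torchvision.ops.nms` call never suppresses across classes unless `agnostic=True`. A small demo with made-up boxes:

```python
# Demo of the class-offset trick: the same box with two different classes
# survives per-class NMS but not class-agnostic NMS.
import torch
import torchvision

boxes   = torch.tensor([[10., 10., 50., 50.],
                        [10., 10., 50., 50.]])   # identical, made-up boxes
scores  = torch.tensor([0.9, 0.8])
classes = torch.tensor([0., 1.])
max_wh  = 4096

print(torchvision.ops.nms(boxes, scores, 0.45).tolist())                              # [0]
print(torchvision.ops.nms(boxes + classes[:, None] * max_wh, scores, 0.45).tolist())  # [0, 1]
```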
+ Arguments: + detections (Array[N, 6]), x1, y1, x2, y2, conf, class + labels (Array[M, 5]), class, x1, y1, x2, y2 + Returns: + correct (Array[N, 10]), for 10 IoU levels + """ + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) + iou = general.box_iou(labels[:, 1:], detections[:, :4]) + correct_class = labels[:, 0:1] == detections[:, 5] + for i in range(len(iouv)): + x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + # matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + correct[matches[:, 1].astype(int), i] = True + return torch.tensor(correct, dtype=torch.bool, device=iouv.device) \ No newline at end of file diff --git a/yolov6/utils/test2.py b/yolov6/utils/test2.py new file mode 100644 index 00000000..f21ad021 --- /dev/null +++ b/yolov6/utils/test2.py @@ -0,0 +1,37 @@ +def process_batch(detections, labels, iouv, pred_masks=None, gt_masks=None, overlap=False, masks=False): + """ + Return correct prediction matrix + Arguments: + detections (array[N, 6]), x1, y1, x2, y2, conf, class + labels (array[M, 5]), class, x1, y1, x2, y2 + Returns: + correct (array[N, 10]), for 10 IoU levels + """ + #breakpoint() + if masks: + gt_masks = gt_masks.to(pred_masks.device) + if overlap: + nl = len(labels) + index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1 + gt_masks = gt_masks.repeat(nl, 1, 1) # shape(1,640,640) -> (n,640,640) + gt_masks = torch.where(gt_masks == index, 1.0, 0.0) + if gt_masks.shape[1:] != pred_masks.shape[1:]: + gt_masks = F.interpolate(gt_masks[None].to(torch.float32), pred_masks.shape[1:], mode='bilinear', align_corners=False)[0] + gt_masks = gt_masks.gt_(0.5) + iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1)).to(iouv.device) + else: # boxes + iou = box_iou(labels[:, 1:], detections[:, :4]).to(iouv.device) + + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) + correct_class = labels[:, 0:1] == detections[:, 5] + for i in range(len(iouv)): + x = torch.where((iou >= iouv[i]) & correct_class) # IoU > threshold and classes match + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() # [label, detect, iou] + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + # matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + correct[matches[:, 1].astype(int), i] = True + return torch.tensor(correct, dtype=torch.bool, device=iouv.device) \ No newline at end of file
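The `process_batch` helpers above (including the copies in `test1.py`/`test2.py`) decide which detections count as true positives by sorting candidate label/detection pairs by IoU and keeping each detection and each label at most once. A compact sketch of that de-duplication step with invented numbers:

```python
# Greedy match de-duplication from process_batch(): best IoU first, then one
# match per detection and one per label.
import numpy as np

# columns: label idx, detection idx, IoU (made-up values)
matches = np.array([[0, 0, 0.90],
                    [0, 1, 0.80],   # label 0 also matched by detection 1
                    [1, 1, 0.60]])  # detection 1 also matches label 1

matches = matches[matches[:, 2].argsort()[::-1]]                    # sort by IoU, descending
matches = matches[np.unique(matches[:, 1], return_index=True)[1]]   # keep one row per detection
matches = matches[np.unique(matches[:, 0], return_index=True)[1]]   # keep one row per label
print(matches)   # [[0. 0. 0.9]] -> detection 0 is the TP for label 0 at this IoU threshold
```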