Skip to content

Commit

Permalink
Add UNITOPatho dataset (#750)
Browse files Browse the repository at this point in the history
  • Loading branch information
nkaenzig authored Feb 4, 2025
1 parent 2d81aaa commit 6a6e3f2
Show file tree
Hide file tree
Showing 16 changed files with 508 additions and 7 deletions.
110 changes: 110 additions & 0 deletions configs/vision/pathology/offline/classification/unitopatho.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
---
# Offline classification config for UniToPatho: a torch.nn.Linear head is
# trained on embeddings read from DATASET_EMBEDDINGS_ROOT, while the raw
# UniToPatho images are only referenced by the predict datasets.
# Values written as ${oc.env:VAR, default} read the environment variable VAR
# and fall back to the given default (OmegaConf `oc.env` resolver).
# NOTE(review): indentation appears to have been stripped from this extract,
# so the nesting of the keys below cannot be confirmed from this view.
trainer:
class_path: eva.Trainer
init_args:
# NOTE(review): the N_RUNS anchor is not aliased anywhere in this document.
n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
# Anchored as OUTPUT_ROOT and reused as the TensorBoard logger save_dir.
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/unitopatho}
# Anchored as MAX_STEPS and reused as T_max of the LR scheduler.
max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
# Checkpointing defines the monitored metric and mode; early stopping below
# reuses both through the MONITOR_METRIC / MONITOR_METRIC_MODE aliases.
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: true
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 45}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
# Embeddings writer: its output_dir is anchored as DATASET_EMBEDDINGS_ROOT and
# reused as the root of the train/val embeddings datasets further down.
- class_path: eva.callbacks.ClassificationEmbeddingsWriter
init_args:
output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/${oc.env:MODEL_NAME, dino_vits16}/unitopatho
# Presumably maps the predict dataloader index to a split name, in the same
# order as the predict datasets below (train, val) — TODO confirm.
dataloader_idx_map:
0: train
1: val
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
overwrite: false
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
# Head module: linear classifier plus its loss, optimizer, scheduler and metrics.
model:
class_path: eva.HeadModule
init_args:
head:
class_path: torch.nn.Linear
init_args:
in_features: ${oc.env:IN_FEATURES, 384}
# Six output classes; NUM_CLASSES is reused by the metrics below.
out_features: &NUM_CLASSES 6
criterion: torch.nn.CrossEntropyLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.0003}
lr_scheduler:
class_path: torch.optim.lr_scheduler.CosineAnnealingLR
init_args:
# Scheduler length is tied to the trainer's max_steps via the alias.
T_max: *MAX_STEPS
eta_min: 0.0
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.MulticlassClassificationMetrics
init_args:
num_classes: *NUM_CLASSES
data:
class_path: eva.DataModule
init_args:
datasets:
# Train/val read embeddings from the directory the writer callback outputs to.
train:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args: &DATASET_ARGS
root: *DATASET_EMBEDDINGS_ROOT
manifest_file: manifest.csv
split: train
val:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args:
# Merge the train arguments and override only the split.
<<: *DATASET_ARGS
split: val
# Raw-image datasets for prediction; both entries share root and transforms
# through PREDICT_DATASET_ARGS and differ only in split.
predict:
- class_path: eva.vision.datasets.UniToPatho
init_args: &PREDICT_DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/unitopatho}
split: train
transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
- class_path: eva.vision.datasets.UniToPatho
init_args:
<<: *PREDICT_DATASET_ARGS
split: val
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
predict:
# Predict uses its own batch size (default 64) but shares the worker count.
# NOTE(review): the PREDICT_BATCH_SIZE anchor is not aliased anywhere in this
# document.
batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
num_workers: *N_DATA_WORKERS
90 changes: 90 additions & 0 deletions configs/vision/pathology/online/classification/unitopatho.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
---
# Online classification config for UniToPatho: the HeadModule is configured
# with both a backbone and a torch.nn.Linear head, and the train/val datasets
# are the raw UniToPatho images (no embeddings writer callback is configured).
# Values written as ${oc.env:VAR, default} read the environment variable VAR
# and fall back to the given default (OmegaConf `oc.env` resolver).
# NOTE(review): indentation appears to have been stripped from this extract,
# so the nesting of the keys below cannot be confirmed from this view.
trainer:
class_path: eva.Trainer
init_args:
# NOTE(review): the N_RUNS anchor is not aliased anywhere in this document.
n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
# Anchored as OUTPUT_ROOT and reused as the TensorBoard logger save_dir.
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/online/unitopatho}
# Anchored as MAX_STEPS and reused as T_max of the LR scheduler.
max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
# Checkpointing defines the monitored metric and mode; early stopping below
# reuses both through the MONITOR_METRIC / MONITOR_METRIC_MODE aliases.
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: true
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 45}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
# Head module: backbone, linear classifier, loss, optimizer, scheduler, metrics.
model:
class_path: eva.HeadModule
init_args:
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
head:
class_path: torch.nn.Linear
init_args:
in_features: ${oc.env:IN_FEATURES, 384}
# Six output classes; NUM_CLASSES is reused by the metrics below.
out_features: &NUM_CLASSES 6
criterion: torch.nn.CrossEntropyLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.0003}
lr_scheduler:
class_path: torch.optim.lr_scheduler.CosineAnnealingLR
init_args:
# Scheduler length is tied to the trainer's max_steps via the alias.
T_max: *MAX_STEPS
eta_min: 0.0
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.MulticlassClassificationMetrics
init_args:
num_classes: *NUM_CLASSES
data:
class_path: eva.DataModule
init_args:
datasets:
# Train and val share root and transforms through DATASET_ARGS.
train:
class_path: eva.vision.datasets.UniToPatho
init_args: &DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/unitopatho}
split: train
transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
val:
class_path: eva.vision.datasets.UniToPatho
init_args:
# Merge the train arguments and override only the split.
<<: *DATASET_ARGS
split: val
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
# Validation reuses the training batch size and worker count via aliases.
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
15 changes: 8 additions & 7 deletions docs/datasets/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,18 @@
### Whole Slide (WSI) and microscopy image datasets

#### Patch-level
| Dataset | #Patches | Patch Size | Magnification (μm/px) | Task | Tissue Type |
|------------------------------------|----------|------------|------------------------|----------------------------|------------------|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast |
| Dataset | #Patches | Patch Size | Magnification (μm/px) | Task | Tissue Type |
|------------------------------------|----------|-------------|------------------------|----------------------------|------------------|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast |
| [BRACS](bracs.md) | 4539 | variable | 40x (0.25) | Classification (7 classes) | Breast |
| [BreakHis](breakhis.md) | 1995 | 700x460 | 40x (0.25) | Classification (8 classes) | Breast |
| [CRC](crc.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal |
| [CRC](crc.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal |
| [GleasonArvaniti](gleason_arvaniti.md) | 22,752 | 750x750 | 40x (0.23) | Classification (4 classes) | Prostate |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast |
| [MHIST](mhist.md) | 3,152 | 224x224 | 5x (2.0) \* | Classification (2 classes) | Colorectal Polyp |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast |
| [MHIST](mhist.md) | 3,152 | 224x224 | 5x (2.0) \* | Classification (2 classes) | Colorectal Polyp |
| [UniToPatho](unitopatho.md) | 8669 | 1812 x 1812 | 20x (0.4415) | Classification (6 classes) | Colorectal Polyp |
| [MoNuSAC](monusac.md) | 294 | 113x81 - 1398x1956 | 40x (0.25) | Segmentation (4 classes) | Multi-Organ Cell Type (Breast, Kidney, Lung and Prostate) |
| [CoNSeP](consep.md) | 41 | 1000x1000 | 40x (0.25) \* | Segmentation (8 classes) | Colorectal Nuclear |
| [CoNSeP](consep.md) | 41 | 1000x1000 | 40x (0.25) \* | Segmentation (8 classes) | Colorectal Nuclear |

\* Downsampled from 40x (0.25 μm/px) to increase the field of view.

Expand Down
68 changes: 68 additions & 0 deletions docs/datasets/unitopatho.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# UniToPatho


UniToPatho is an annotated dataset of 9536 hematoxylin and eosin stained patches extracted from 292 whole-slide images, meant for training deep neural networks for colorectal polyps classification and adenomas grading. The slides are acquired through a Hamamatsu Nanozoomer S210 scanner at 20x magnification (0.4415 μm/px). Each slide belongs to a different patient and is annotated by expert pathologists, according to six classes as follows:

- NORM - Normal tissue;
- HP - Hyperplastic Polyp;
- TA.HG - Tubular Adenoma, High-Grade dysplasia;
- TA.LG - Tubular Adenoma, Low-Grade dysplasia;
- TVA.HG - Tubulo-Villous Adenoma, High-Grade dysplasia;
- TVA.LG - Tubulo-Villous Adenoma, Low-Grade dysplasia.

For this benchmark we used only the `800` subset, which contains 8669 images of resolution 1812x1812 (the `7000` subset contains much bigger images and would therefore be difficult to handle as a patch classification task).

## Raw data

### Key stats

| | |
|--------------------------------|-----------------------------|
| **Modality** | Vision (WSI patches) |
| **Task** | Multiclass classification (6 classes) |
| **Cancer type** | Colorectal |
| **Data size** | 48.37 GB |
| **Image dimension** | 1812 x 1812 |
| **Magnification (μm/px)** | 20x (0.4415) |
| **Magnification after resize (μm/px)** | ~2.5x (3.57) |
| **Files format** | `png` |
| **Number of images** | 8669 |


### Splits

The data source provides train/validation splits:

| Splits | Train | Validation |
|----------|--------------|---------------|
| #Samples | 6270 (72.33%) | 2399 (27.67%) |

The dataset authors only provide two splits, which is why we don't report performance on a third test split.


### Organization

The UniToPatho data is organized as follows (note that we are using only the `800` subset):

```
unitopatho
├── 800
│   ├── test.csv
│   ├── train.csv
│   ├── HP # 1 folder per class
│   ├── NORM
│   ├── TA.HG
│   ├── ...
```


## Download and preprocessing
The `UniToPatho` dataset class doesn't download the data at runtime; the data must be downloaded manually from [the official source](https://ieee-dataport.org/open-access/unitopatho).

## Relevant links

* [GitHub Repo](https://github.com/EIDOSLAB/UNITOPATHO)

## License

[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ nav:
- GleasonArvaniti: datasets/gleason_arvaniti.md
- MHIST: datasets/mhist.md
- PatchCamelyon: datasets/patch_camelyon.md
- UniToPatho: datasets/unitopatho.md
- MoNuSAC: datasets/monusac.md
- CoNSeP: datasets/consep.md
- BCSS: datasets/bcss.md
Expand Down
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
GleasonArvaniti,
PANDASmall,
PatchCamelyon,
UniToPatho,
WsiClassificationDataset,
)
from eva.vision.data.datasets.segmentation import (
Expand Down Expand Up @@ -38,6 +39,7 @@
"PANDASmall",
"Camelyon16",
"PatchCamelyon",
"UniToPatho",
"WsiClassificationDataset",
"CoNSeP",
"EmbeddingsSegmentationDataset",
Expand Down
3 changes: 3 additions & 0 deletions src/eva/vision/data/datasets/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from eva.vision.data.datasets.classification.mhist import MHIST
from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall
from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
from eva.vision.data.datasets.classification.unitopatho import UniToPatho
from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset

__all__ = [
Expand All @@ -20,7 +21,9 @@
"GleasonArvaniti",
"MHIST",
"PatchCamelyon",
"UniToPatho",
"WsiClassificationDataset",
"PANDA",
"PANDASmall",
"Camelyon16",
]
Loading

0 comments on commit 6a6e3f2

Please sign in to comment.