
Add GleasonArvaniti dataset (#754)
nkaenzig authored Feb 4, 2025
1 parent c60fe16 commit 2f75163
Showing 14 changed files with 528 additions and 1 deletion.
123 changes: 123 additions & 0 deletions configs/vision/pathology/offline/classification/gleason_arvaniti.yaml
@@ -0,0 +1,123 @@
---
trainer:
  class_path: eva.Trainer
  init_args:
    n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/gleason_arvaniti}
    max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
    checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
    callbacks:
      - class_path: eva.callbacks.ConfigurationLogger
      - class_path: lightning.pytorch.callbacks.TQDMProgressBar
        init_args:
          refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
      - class_path: lightning.pytorch.callbacks.LearningRateMonitor
        init_args:
          logging_interval: epoch
      - class_path: lightning.pytorch.callbacks.ModelCheckpoint
        init_args:
          filename: best
          save_last: true
          save_top_k: 1
          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
      - class_path: lightning.pytorch.callbacks.EarlyStopping
        init_args:
          min_delta: 0
          patience: ${oc.env:PATIENCE, 21}
          monitor: *MONITOR_METRIC
          mode: *MONITOR_METRIC_MODE
      - class_path: eva.callbacks.ClassificationEmbeddingsWriter
        init_args:
          output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/${oc.env:MODEL_NAME, dino_vits16}/gleason_arvaniti
          dataloader_idx_map:
            0: train
            1: val
            2: test
          backbone:
            class_path: eva.vision.models.ModelFromRegistry
            init_args:
              model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
              model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
          overwrite: false
    logger:
      - class_path: lightning.pytorch.loggers.TensorBoardLogger
        init_args:
          save_dir: *OUTPUT_ROOT
          name: ""
model:
  class_path: eva.HeadModule
  init_args:
    head:
      class_path: torch.nn.Linear
      init_args:
        in_features: ${oc.env:IN_FEATURES, 384}
        out_features: &NUM_CLASSES 4
    criterion: torch.nn.CrossEntropyLoss
    optimizer:
      class_path: torch.optim.AdamW
      init_args:
        lr: ${oc.env:LR_VALUE, 0.0003}
    lr_scheduler:
      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
      init_args:
        T_max: *MAX_STEPS
        eta_min: 0.0
    metrics:
      common:
        - class_path: eva.metrics.AverageLoss
        - class_path: eva.metrics.MulticlassClassificationMetrics
          init_args:
            num_classes: *NUM_CLASSES
data:
  class_path: eva.DataModule
  init_args:
    datasets:
      train:
        class_path: eva.datasets.EmbeddingsClassificationDataset
        init_args: &DATASET_ARGS
          root: *DATASET_EMBEDDINGS_ROOT
          manifest_file: manifest.csv
          split: train
      val:
        class_path: eva.datasets.EmbeddingsClassificationDataset
        init_args:
          <<: *DATASET_ARGS
          split: val
      test:
        class_path: eva.datasets.EmbeddingsClassificationDataset
        init_args:
          <<: *DATASET_ARGS
          split: test
      predict:
        - class_path: eva.vision.datasets.GleasonArvaniti
          init_args: &PREDICT_DATASET_ARGS
            root: ${oc.env:DATA_ROOT, ./data/arvaniti_gleason_patches}
            split: train
            transforms:
              class_path: eva.vision.data.transforms.common.ResizeAndCrop
              init_args:
                mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
                std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
        - class_path: eva.vision.datasets.GleasonArvaniti
          init_args:
            <<: *PREDICT_DATASET_ARGS
            split: val
        - class_path: eva.vision.datasets.GleasonArvaniti
          init_args:
            <<: *PREDICT_DATASET_ARGS
            split: test
    dataloaders:
      train:
        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
        num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
        shuffle: true
      val:
        batch_size: *BATCH_SIZE
        num_workers: *N_DATA_WORKERS
      test:
        batch_size: *BATCH_SIZE
        num_workers: *N_DATA_WORKERS
      predict:
        batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
        num_workers: *N_DATA_WORKERS
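
The offline config above first writes backbone embeddings to disk (via the `ClassificationEmbeddingsWriter` callback and the `predict` datasets/dataloaders) and then fits the linear head on those embeddings. A minimal launcher sketch, assuming `eva` is installed, that `eva predict_fit` is the entry point for offline runs, and using the environment variables this config reads through `${oc.env:...}`:

```python
# Hypothetical launcher: set the env vars the config resolves via ${oc.env:...}
# and invoke the eva CLI. Paths and values are illustrative defaults.
import os
import subprocess

env = os.environ.copy()
env.update({
    "MODEL_NAME": "universal/vit_small_patch16_224_dino",  # backbone from the model registry
    "DATA_ROOT": "./data/arvaniti_gleason_patches",        # preprocessed patches (see docs below)
    "EMBEDDINGS_ROOT": "./data/embeddings",                # where the embeddings writer stores outputs
    "IN_FEATURES": "384",                                   # ViT-S embedding dimension
})

subprocess.run(
    ["eva", "predict_fit", "--config",
     "configs/vision/pathology/offline/classification/gleason_arvaniti.yaml"],
    env=env,
    check=True,
)
```

The online config that follows evaluates the same task without the intermediate embedding-writing step, running the backbone on the fly.
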
98 changes: 98 additions & 0 deletions configs/vision/pathology/online/classification/gleason_arvaniti.yaml
@@ -0,0 +1,98 @@
---
trainer:
  class_path: eva.Trainer
  init_args:
    n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
    default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/online/gleason_arvaniti}
    max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
    checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
    callbacks:
      - class_path: eva.callbacks.ConfigurationLogger
      - class_path: lightning.pytorch.callbacks.TQDMProgressBar
        init_args:
          refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
      - class_path: lightning.pytorch.callbacks.LearningRateMonitor
        init_args:
          logging_interval: epoch
      - class_path: lightning.pytorch.callbacks.ModelCheckpoint
        init_args:
          filename: best
          save_last: true
          save_top_k: 1
          monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
          mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
      - class_path: lightning.pytorch.callbacks.EarlyStopping
        init_args:
          min_delta: 0
          patience: ${oc.env:PATIENCE, 21}
          monitor: *MONITOR_METRIC
          mode: *MONITOR_METRIC_MODE
    logger:
      - class_path: lightning.pytorch.loggers.TensorBoardLogger
        init_args:
          save_dir: *OUTPUT_ROOT
          name: ""
model:
  class_path: eva.HeadModule
  init_args:
    backbone:
      class_path: eva.vision.models.ModelFromRegistry
      init_args:
        model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
        model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
    head:
      class_path: torch.nn.Linear
      init_args:
        in_features: ${oc.env:IN_FEATURES, 384}
        out_features: &NUM_CLASSES 4
    criterion: torch.nn.CrossEntropyLoss
    optimizer:
      class_path: torch.optim.AdamW
      init_args:
        lr: ${oc.env:LR_VALUE, 0.0003}
    lr_scheduler:
      class_path: torch.optim.lr_scheduler.CosineAnnealingLR
      init_args:
        T_max: *MAX_STEPS
        eta_min: 0.0
    metrics:
      common:
        - class_path: eva.metrics.AverageLoss
        - class_path: eva.metrics.MulticlassClassificationMetrics
          init_args:
            num_classes: *NUM_CLASSES
data:
  class_path: eva.DataModule
  init_args:
    datasets:
      train:
        class_path: eva.vision.datasets.GleasonArvaniti
        init_args: &DATASET_ARGS
          root: ${oc.env:DATA_ROOT, ./data/arvaniti_gleason_patches}
          split: train
          transforms:
            class_path: eva.vision.data.transforms.common.ResizeAndCrop
            init_args:
              mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
              std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
      val:
        class_path: eva.vision.datasets.GleasonArvaniti
        init_args:
          <<: *DATASET_ARGS
          split: val
      test:
        class_path: eva.vision.datasets.GleasonArvaniti
        init_args:
          <<: *DATASET_ARGS
          split: test
    dataloaders:
      train:
        batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
        num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
        shuffle: true
      val:
        batch_size: *BATCH_SIZE
        num_workers: *N_DATA_WORKERS
      test:
        batch_size: *BATCH_SIZE
        num_workers: *N_DATA_WORKERS
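
For orientation, the `eva.HeadModule` settings above amount to training a 4-class linear head on 384-dimensional ViT-S features with cross-entropy loss, AdamW (lr 3e-4) and cosine annealing over 12,500 steps. A rough plain-PyTorch sketch of just those declared pieces (illustrative only, not eva's actual training loop):

```python
# Illustrative equivalent of the head/criterion/optimizer/lr_scheduler init_args above.
import torch
from torch import nn

NUM_CLASSES = 4    # benign, Gleason pattern 3, 4, 5
IN_FEATURES = 384  # ViT-S/16 embedding dimension
MAX_STEPS = 12_500

head = nn.Linear(IN_FEATURES, NUM_CLASSES)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(head.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=MAX_STEPS, eta_min=0.0)

# One training step on a batch of backbone features with integer class targets:
features = torch.randn(256, IN_FEATURES)         # stand-in for backbone(images)
targets = torch.randint(0, NUM_CLASSES, (256,))
loss = criterion(head(features), targets)
loss.backward()
optimizer.step()
optimizer.zero_grad()
scheduler.step()
```
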
74 changes: 74 additions & 0 deletions docs/datasets/gleason_arvaniti.md
@@ -0,0 +1,74 @@
# Gleason (Arvaniti)

Benchmark dataset for automated Gleason grading of prostate cancer tissue microarrays via deep learning as proposed by [Arvaniti et al.](https://www.nature.com/articles/s41598-018-30535-1).

Images are classified as benign or as Gleason pattern 3, 4 or 5. The dataset contains annotations for a discovery/train cohort of 641 patients and for an independent test cohort of 245 patients annotated by two pathologists. For the test cohort, we only use the labels from pathologist No. 1 for this benchmark.
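
The patch filenames encode the label as a `class_<index>` suffix (see the folder structure further down). Purely as an illustration — the index-to-grade mapping below is an assumption, not taken from the dataset class — a label could be recovered from a filename like this:

```python
# Hypothetical helper: parse the class_<index> suffix of an Arvaniti patch filename.
# Assumed mapping: 0 = benign, 1/2/3 = Gleason pattern 3/4/5.
import re

CLASS_NAMES = {0: "benign", 1: "gleason_3", 2: "gleason_4", 3: "gleason_5"}

def label_from_filename(filename: str) -> str:
    match = re.search(r"class_(\d+)", filename)
    if match is None:
        raise ValueError(f"no class suffix in {filename!r}")
    return CLASS_NAMES[int(match.group(1))]

print(label_from_filename("ZT76_39_A_1_1_patch_12_class_0.jpg"))  # -> benign
```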

## Raw data

### Key stats

| | |
|--------------------------------|-----------------------------|
| **Modality** | Vision (WSI patches) |
| **Task** | Multiclass classification (4 classes) |
| **Cancer type** | Prostate |
| **Data size** | 4 GB |
| **Image dimension** | 750 x 750 |
| **Magnification (μm/px)** | 40x (0.23) |
| **Files format** | `jpg` |
| **Number of images** | 22,752 |


### Splits

We use the same splits as proposed in the paper:

| Splits | Train | Validation | Test |
|---|---------------|--------------|--------------|
| #Samples | 15,303 (67.26%) | 2,482 (10.91%) | 4,967 (21.83%) |

Note that the authors chose TMA 76 as the validation cohort because it contains the most balanced distribution of Gleason scores.


## Download and preprocessing
The `GleasonArvaniti` dataset class doesn't download the data during runtime; the data must be downloaded and preprocessed manually:

1. Download dataset archives from the [official source](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OCYCMP)
2. Unpack all .tar.gz archives into the same folder
3. Adjust the folder structure and then run `create_patches.py` from https://github.com/eiriniar/gleason_CNN/tree/master

This should result in the following folder structure:

```
arvaniti_gleason_patches
├── test_patches_750
│   ├── patho_1
│   │   ├── ZT80_38_A_1_1
│   │   │   ├── ZT76_39_A_1_1_patch_12_class_0.jpg
│   │   │   ├── ZT76_39_A_1_1_patch_23_class_0.jpg
│   │   │   └── ...
│   │   ├── ZT80_38_A_1_2
│   │   │   └── ...
│   │   └── ...
│   ├── patho_2  # we don't use this
│   │   └── ...
├── train_validation_patches_750
│   ├── ZT76_39_A_1_1
│   │   ├── ZT76_39_A_1_1_patch_12_class_0.jpg
│   │   ├── ZT76_39_A_1_1_patch_23_class_0.jpg
│   │   └── ...
│   ├── ZT76_39_A_1_2
│   └── ...
```
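
With the data laid out as above, the new dataset class can be instantiated directly. A minimal usage sketch, mirroring the `init_args` (`root`, `split`, `transforms`) that the YAML configs in this commit pass to `eva.vision.datasets.GleasonArvaniti`:

```python
# Usage sketch based on the init_args used in the configs of this commit.
# Depending on the eva version, an additional setup call may be required before indexing.
from eva.vision import datasets
from eva.vision.data.transforms import common

dataset = datasets.GleasonArvaniti(
    root="./data/arvaniti_gleason_patches",  # folder produced by the steps above
    split="train",                           # one of "train", "val", "test"
    transforms=common.ResizeAndCrop(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
)
```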

## Relevant links

* [Paper](https://www.nature.com/articles/s41598-018-30535-1)
* [GitHub](https://github.com/eiriniar/gleason_CNN)
* [Dataset](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OCYCMP)

## License

[CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/)

1 change: 1 addition & 0 deletions docs/datasets/index.md
@@ -11,6 +11,7 @@
|------------------------------------|----------|------------|------------------------|----------------------------|------------------|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast |
| [CRC](crc.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal |
| [GleasonArvaniti](gleason_arvaniti.md) | 22,752 | 750x750 | 40x (0.23) | Classification (4 classes) | Prostate |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast |
| [MHIST](mhist.md) | 3,152 | 224x224 | 5x (2.0) \* | Classification (2 classes) | Colorectal Polyp |
| [MoNuSAC](monusac.md) | 294 | 113x81 - 1398x1956 | 40x (0.25) | Segmentation (4 classes) | Multi-Organ Cell Type (Breast, Kidney, Lung and Prostate) |
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -85,6 +85,7 @@ nav:
- Patch-level:
- BACH: datasets/bach.md
- CRC: datasets/crc.md
- GleasonArvaniti: datasets/gleason_arvaniti.md
- MHIST: datasets/mhist.md
- PatchCamelyon: datasets/patch_camelyon.md
- MoNuSAC: datasets/monusac.md
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
@@ -6,6 +6,7 @@
    MHIST,
    PANDA,
    Camelyon16,
    GleasonArvaniti,
    PANDASmall,
    PatchCamelyon,
    WsiClassificationDataset,
@@ -27,6 +28,7 @@
"BACH",
"BCSS",
"CRC",
"GleasonArvaniti",
"MHIST",
"PANDA",
"PANDASmall",
4 changes: 3 additions & 1 deletion src/eva/vision/data/datasets/classification/__init__.py
@@ -3,18 +3,20 @@
from eva.vision.data.datasets.classification.bach import BACH
from eva.vision.data.datasets.classification.camelyon16 import Camelyon16
from eva.vision.data.datasets.classification.crc import CRC
from eva.vision.data.datasets.classification.gleason_arvaniti import GleasonArvaniti
from eva.vision.data.datasets.classification.mhist import MHIST
from eva.vision.data.datasets.classification.panda import PANDA, PANDASmall
from eva.vision.data.datasets.classification.patch_camelyon import PatchCamelyon
from eva.vision.data.datasets.classification.wsi import WsiClassificationDataset

__all__ = [
"BACH",
"Camelyon16",
"CRC",
"GleasonArvaniti",
"MHIST",
"PatchCamelyon",
"WsiClassificationDataset",
"PANDA",
"PANDASmall",
"Camelyon16",
]