Skip to content

Commit

Permalink
Add BRACS dataset (#751)
Browse files Browse the repository at this point in the history
  • Loading branch information
nkaenzig authored Feb 4, 2025
1 parent 2f75163 commit 6f6b45f
Show file tree
Hide file tree
Showing 15 changed files with 480 additions and 0 deletions.
123 changes: 123 additions & 0 deletions configs/vision/pathology/offline/classification/bracs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
---
# Trainer settings for the offline BRACS classification run.
# Values written as ${oc.env:VAR, default} are OmegaConf interpolations that read the
# environment variable VAR and fall back to the given default.
trainer:
class_path: eva.Trainer
init_args:
n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
# Output directory; the default nests the model name under logs/ and ends in offline/bracs.
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/offline/bracs}
# MAX_STEPS is anchored here and reused as T_max of the LR scheduler in the model section.
max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
# Checkpointing: keeps the single best checkpoint (file name `best`) according to the
# monitored metric, and additionally saves the last one.
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: true
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
# Early stopping reuses the checkpoint callback's metric and mode through the aliases.
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 74}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
# Embeddings writer: output_dir resolves to <EMBEDDINGS_ROOT>/<MODEL_NAME>/bracs and is
# anchored so the data section can use the same directory as its dataset root.
- class_path: eva.callbacks.ClassificationEmbeddingsWriter
init_args:
output_dir: &DATASET_EMBEDDINGS_ROOT ${oc.env:EMBEDDINGS_ROOT, ./data/embeddings}/${oc.env:MODEL_NAME, dino_vits16}/bracs
# Index -> split name; the order matches the three predict datasets (train, val, test)
# listed in the data section.
dataloader_idx_map:
0: train
1: val
2: test
# NOTE(review): model_name reads the same MODEL_NAME variable as the paths above but falls
# back to a different default (universal/vit_small_patch16_224_dino vs. dino_vits16).
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
overwrite: false
# TensorBoard logs are written to the same OUTPUT_ROOT as default_root_dir.
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
# Classification head configuration for the offline BRACS run.
model:
class_path: eva.HeadModule
init_args:
# Single linear layer: IN_FEATURES inputs (default 384) and 7 outputs.
# NUM_CLASSES is anchored here and reused by the metrics below.
head:
class_path: torch.nn.Linear
init_args:
in_features: ${oc.env:IN_FEATURES, 384}
out_features: &NUM_CLASSES 7
criterion: torch.nn.CrossEntropyLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.0003}
# Cosine annealing down to 0.0; T_max is tied to the trainer's max_steps via the alias.
lr_scheduler:
class_path: torch.optim.lr_scheduler.CosineAnnealingLR
init_args:
T_max: *MAX_STEPS
eta_min: 0.0
# NOTE(review): `common` presumably means the metrics are shared across all stages;
# confirm against eva.HeadModule.
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.MulticlassClassificationMetrics
init_args:
num_classes: *NUM_CLASSES
# Data configuration for the offline BRACS run.
data:
class_path: eva.DataModule
init_args:
datasets:
# train/val/test read from DATASET_EMBEDDINGS_ROOT, i.e. the output_dir of the
# ClassificationEmbeddingsWriter callback in the trainer section.
train:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args: &DATASET_ARGS
root: *DATASET_EMBEDDINGS_ROOT
manifest_file: manifest.csv
split: train
# val and test merge the train arguments via `<<` and override only `split`.
val:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args:
<<: *DATASET_ARGS
split: val
test:
class_path: eva.datasets.EmbeddingsClassificationDataset
init_args:
<<: *DATASET_ARGS
split: test
# Three BRACS image datasets (train, val, test, in that order); root and transforms are
# shared through PREDICT_DATASET_ARGS, only `split` differs.
predict:
- class_path: eva.vision.datasets.BRACS
init_args: &PREDICT_DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/bracs}
split: train
transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
- class_path: eva.vision.datasets.BRACS
init_args:
<<: *PREDICT_DATASET_ARGS
split: val
- class_path: eva.vision.datasets.BRACS
init_args:
<<: *PREDICT_DATASET_ARGS
split: test
# val/test reuse the train batch size and worker count via aliases; shuffle is set only
# for train.
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
test:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
# NOTE(review): the PREDICT_BATCH_SIZE anchor is not referenced anywhere else in this file.
predict:
batch_size: &PREDICT_BATCH_SIZE ${oc.env:PREDICT_BATCH_SIZE, 64}
num_workers: *N_DATA_WORKERS
98 changes: 98 additions & 0 deletions configs/vision/pathology/online/classification/bracs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
---
# Trainer settings for the online BRACS classification run.
# Values written as ${oc.env:VAR, default} are OmegaConf interpolations that read the
# environment variable VAR and fall back to the given default.
trainer:
class_path: eva.Trainer
init_args:
n_runs: &N_RUNS ${oc.env:N_RUNS, 5}
# Output directory; the default nests the model name under logs/ and ends in online/bracs.
default_root_dir: &OUTPUT_ROOT ${oc.env:OUTPUT_ROOT, logs/${oc.env:MODEL_NAME, dino_vits16}/online/bracs}
# MAX_STEPS is anchored here and reused as T_max of the LR scheduler in the model section.
max_steps: &MAX_STEPS ${oc.env:MAX_STEPS, 12500}
checkpoint_type: ${oc.env:CHECKPOINT_TYPE, best}
callbacks:
- class_path: eva.callbacks.ConfigurationLogger
- class_path: lightning.pytorch.callbacks.TQDMProgressBar
init_args:
refresh_rate: ${oc.env:TQDM_REFRESH_RATE, 1}
- class_path: lightning.pytorch.callbacks.LearningRateMonitor
init_args:
logging_interval: epoch
# Checkpointing: keeps the single best checkpoint (file name `best`) according to the
# monitored metric, and additionally saves the last one.
- class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
filename: best
save_last: true
save_top_k: 1
monitor: &MONITOR_METRIC ${oc.env:MONITOR_METRIC, val/MulticlassAccuracy}
mode: &MONITOR_METRIC_MODE ${oc.env:MONITOR_METRIC_MODE, max}
# Early stopping reuses the checkpoint callback's metric and mode through the aliases.
- class_path: lightning.pytorch.callbacks.EarlyStopping
init_args:
min_delta: 0
patience: ${oc.env:PATIENCE, 74}
monitor: *MONITOR_METRIC
mode: *MONITOR_METRIC_MODE
# TensorBoard logs are written to the same OUTPUT_ROOT as default_root_dir.
logger:
- class_path: lightning.pytorch.loggers.TensorBoardLogger
init_args:
save_dir: *OUTPUT_ROOT
name: ""
# Model configuration for the online BRACS run: a backbone plus a linear head.
model:
class_path: eva.HeadModule
init_args:
# NOTE(review): model_name reads the same MODEL_NAME variable as default_root_dir in the
# trainer section but falls back to a different default
# (universal/vit_small_patch16_224_dino vs. dino_vits16).
backbone:
class_path: eva.vision.models.ModelFromRegistry
init_args:
model_name: ${oc.env:MODEL_NAME, universal/vit_small_patch16_224_dino}
model_extra_kwargs: ${oc.env:MODEL_EXTRA_KWARGS, null}
# Single linear layer: IN_FEATURES inputs (default 384) and 7 outputs.
# NUM_CLASSES is anchored here and reused by the metrics below.
head:
class_path: torch.nn.Linear
init_args:
in_features: ${oc.env:IN_FEATURES, 384}
out_features: &NUM_CLASSES 7
criterion: torch.nn.CrossEntropyLoss
optimizer:
class_path: torch.optim.AdamW
init_args:
lr: ${oc.env:LR_VALUE, 0.0003}
# Cosine annealing down to 0.0; T_max is tied to the trainer's max_steps via the alias.
lr_scheduler:
class_path: torch.optim.lr_scheduler.CosineAnnealingLR
init_args:
T_max: *MAX_STEPS
eta_min: 0.0
# NOTE(review): `common` presumably means the metrics are shared across all stages;
# confirm against eva.HeadModule.
metrics:
common:
- class_path: eva.metrics.AverageLoss
- class_path: eva.metrics.MulticlassClassificationMetrics
init_args:
num_classes: *NUM_CLASSES
# Data configuration for the online BRACS run: all splits use the BRACS image dataset directly.
data:
class_path: eva.DataModule
init_args:
datasets:
# The train arguments (root, split, transforms) are anchored as DATASET_ARGS.
train:
class_path: eva.vision.datasets.BRACS
init_args: &DATASET_ARGS
root: ${oc.env:DATA_ROOT, ./data/bracs}
split: train
transforms:
class_path: eva.vision.data.transforms.common.ResizeAndCrop
init_args:
mean: ${oc.env:NORMALIZE_MEAN, [0.485, 0.456, 0.406]}
std: ${oc.env:NORMALIZE_STD, [0.229, 0.224, 0.225]}
# val and test merge the train arguments via `<<` and override only `split`.
val:
class_path: eva.vision.datasets.BRACS
init_args:
<<: *DATASET_ARGS
split: val
test:
class_path: eva.vision.datasets.BRACS
init_args:
<<: *DATASET_ARGS
split: test
# val/test reuse the train batch size and worker count via aliases; shuffle is set only
# for train.
dataloaders:
train:
batch_size: &BATCH_SIZE ${oc.env:BATCH_SIZE, 256}
num_workers: &N_DATA_WORKERS ${oc.env:N_DATA_WORKERS, 4}
shuffle: true
val:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
test:
batch_size: *BATCH_SIZE
num_workers: *N_DATA_WORKERS
71 changes: 71 additions & 0 deletions docs/datasets/bracs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# BRACS


The BReAst Carcinoma Subtyping (BRACS) dataset is a new collection of hematoxylin and eosin (H&E)
histopathological images of breast carcinoma.

Whole-slide images (WSIs) of hematoxylin and eosin (H&E) stained breast tissues were generated by using an
Aperio AT2 scanner at 0.25 µm/pixel for 40× resolution. Some Regions of Interest (RoIs) are
associated with a subset of WSIs. See example figure below. Both WSIs and RoIs were annotated
according to the following seven classes (N, PB, UDH, FEA, ADH, DCIS, IC) by three expert
pathologists of the Complex Structure Pathological Anatomy and Cytopathology of National Cancer
Institute – IRCCS Fondazione Pascale, Naples, Italy.

While the full BRACS dataset contains 547 WSIs collected from 189 patients, the `BRACS_RoI` subset that
we use in this benchmark contains 4539 extracted RoIs / patches.

## Raw data

### Key stats

| | |
|--------------------------------|-----------------------------|
| **Modality** | Vision (WSI patches) |
| **Task** | Multiclass classification (7 classes) |
| **Cancer type** | Breast |
| **Data size** | 52 GB |
| **Image dimension** | variable |
| **Magnification (μm/px)** | 40x (0.25) |
| **Files format** | `png` |
| **Number of images** | 4539 |


### Splits

The data source provides train/validation/test splits:

| Splits | Train | Validation | Test |
|----------|---------------|-------------|--------------|
| #Samples | 3657 (80.57%) | 312 (6.87%) | 570 (12.56%) |


### Organization

The BRACS data is organized as follows:

```
BRACS_RoI
├── train
│ ├── 0_N # 1 folder per class
│ ├── 1_PB
│ ├── ...
├── val
│ ├── 0_N
│ ├── ...
├── test
│ ├── 0_N
│ ├── ...
```


## Download and preprocessing
The `BRACS` dataset class doesn't download the data during runtime; the data must be downloaded manually from [the official source](https://www.bracs.icar.cnr.it/download/).

## Relevant links

* [Official Website](https://www.bracs.icar.cnr.it/)
* [Paper](https://academic.oup.com/database/article/doi/10.1093/database/baac093/6762252)

## License

[CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/)
1 change: 1 addition & 0 deletions docs/datasets/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
| Dataset | #Patches | Patch Size | Magnification (μm/px) | Task | Tissue Type |
|------------------------------------|----------|------------|------------------------|----------------------------|------------------|
| [BACH](bach.md) | 400 | 2048x1536 | 20x (0.5) | Classification (4 classes) | Breast |
| [BRACS](bracs.md) | 4539 | variable | 40x (0.25) | Classification (7 classes) | Breast |
| [CRC](crc.md) | 107,180 | 224x224 | 20x (0.5) | Classification (9 classes) | Colorectal |
| [GleasonArvaniti](gleason_arvaniti.md) | 22,752 | 750x750 | 40x (0.23) | Classification (4 classes) | Prostate |
| [PatchCamelyon](patch_camelyon.md) | 327,680 | 96x96 | 10x (1.0) \* | Classification (2 classes) | Breast |
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ nav:
- WSI:
- Patch-level:
- BACH: datasets/bach.md
- BRACS: datasets/bracs.md
- CRC: datasets/crc.md
- GleasonArvaniti: datasets/gleason_arvaniti.md
- MHIST: datasets/mhist.md
Expand Down
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from eva.vision.data.datasets.classification import (
BACH,
BRACS,
CRC,
MHIST,
PANDA,
Expand All @@ -27,6 +28,7 @@
__all__ = [
"BACH",
"BCSS",
"BRACS",
"CRC",
"GleasonArvaniti",
"MHIST",
Expand Down
2 changes: 2 additions & 0 deletions src/eva/vision/data/datasets/classification/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Image classification datasets API."""

from eva.vision.data.datasets.classification.bach import BACH
from eva.vision.data.datasets.classification.bracs import BRACS
from eva.vision.data.datasets.classification.camelyon16 import Camelyon16
from eva.vision.data.datasets.classification.crc import CRC
from eva.vision.data.datasets.classification.gleason_arvaniti import GleasonArvaniti
Expand All @@ -11,6 +12,7 @@

__all__ = [
"BACH",
"BRACS",
"Camelyon16",
"CRC",
"GleasonArvaniti",
Expand Down
Loading

0 comments on commit 6f6b45f

Please sign in to comment.