From 61c0a95769f52beb4fb1700cf83d431c6e4c0c62 Mon Sep 17 00:00:00 2001 From: Matthew Wood <62712722+mattwoodx@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:29:06 +0100 Subject: [PATCH] MKDocs fixes (#169) * Update logging * Mkdocs fixes (#168) * Update version in pyproject.toml --------- Co-authored-by: bputzeys Co-authored-by: Benoit Putzeys <157973952+bputzeys@users.noreply.github.com> --- docs/index.md | 14 +- examples/notebooks/Cell-Type-Annotation.ipynb | 1397 +---------------- helical/models/caduceus/fine_tuning_model.py | 4 +- helical/models/caduceus/model.py | 13 +- helical/models/geneformer/model.py | 13 +- helical/models/helix_mrna/model.py | 17 +- helical/models/hyena_dna/hyena_dna_config.py | 1 + helical/models/hyena_dna/model.py | 13 +- helical/models/mamba2_mrna/model.py | 20 +- helical/models/scgpt/model.py | 12 +- helical/models/scgpt/scgpt_utils.py | 7 +- helical/models/uce/gene_embeddings.py | 8 +- helical/models/uce/model.py | 13 +- helical/models/uce/uce_config.py | 1 + helical/utils/downloader.py | 9 +- mkdocs.yml | 1 - pyproject.toml | 2 +- 17 files changed, 93 insertions(+), 1452 deletions(-) diff --git a/docs/index.md b/docs/index.md index 534ff24f..a8032986 100644 --- a/docs/index.md +++ b/docs/index.md @@ -91,13 +91,13 @@ Within the `example/notebooks` folder, open the notebook of your choice. We reco | Example | Description | Colab | | ----------- | ----------- |----------- | -|[Quick-Start-Tutorial.ipynb](https://github.com/helicalAI/helical/blob/main/examples/notebooks/Quick-Start-Tutorial.ipynb)| A tutorial to quickly get used to the helical package and environment. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Quick-Start-Tutorial.ipynb)| -|[Helix-mRNA.ipynb](./examples/notebooks/Helix-mRNA.ipynb)|An example of how to use the Helix-mRNA model.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Helix-mRNA.ipynb) | -|[Geneformer-vs-UCE.ipynb](https://github.com/helicalAI/helical/blob/main/examples/notebooks/Geneformer-vs-UCE.ipynb) | Zero-Shot Reference Mapping with Geneformer & UCE and compare the outcomes. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Geneformer-vs-UCE.ipynb) | -|[Hyena-DNA-Inference.ipynb](https://github.com/helicalAI/helical/blob/main/examples/notebooks/Hyena-DNA-Inference.ipynb)|An example how to do probing with HyenaDNA by training a neural network on 18 downstream classification tasks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Hyena-DNA-Inference.ipynb)| -|[Cell-Type-Annotation.ipynb](https://github.com/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Annotation.ipynb)|An example how to do probing with scGPT by training a neural network to predict cell type annotations.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Annotation.ipynb) | -|[Cell-Type-Classification-Fine-Tuning.ipynb](./examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb)|An example how to fine-tune different models on classification tasks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb) | -|[HyenaDNA-Fine-Tuning.ipynb](./examples/notebooks/HyenaDNA-Fine-Tuning.ipynb)|An example of how to fine-tune the HyenaDNA model on downstream benchmarks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/HyenaDNA-Fine-Tuning.ipynb) | +|[Quick-Start-Tutorial.ipynb](./notebooks/Quick-Start-Tutorial.ipynb)| A tutorial to quickly get used to the helical package and environment. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Quick-Start-Tutorial.ipynb)| +|[Helix-mRNA.ipynb](./notebooks/Helix-mRNA.ipynb)|An example of how to use the Helix-mRNA model.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Helix-mRNA.ipynb) | +|[Geneformer-vs-UCE.ipynb](./notebooks/Geneformer-vs-UCE.ipynb) | Zero-Shot Reference Mapping with Geneformer & UCE, comparing the outcomes. 
| [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Geneformer-vs-UCE.ipynb) | +|[Hyena-DNA-Inference.ipynb](./notebooks/Hyena-DNA-Inference.ipynb)|An example of how to do probing with HyenaDNA by training a neural network on 18 downstream classification tasks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Hyena-DNA-Inference.ipynb)| +|[Cell-Type-Annotation.ipynb](./notebooks/Cell-Type-Annotation.ipynb)|An example of how to do probing with scGPT by training a neural network to predict cell type annotations.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Annotation.ipynb) | +|[Cell-Type-Classification-Fine-Tuning.ipynb](./notebooks/Cell-Type-Classification-Fine-Tuning.ipynb)|An example of how to fine-tune different models on classification tasks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/Cell-Type-Classification-Fine-Tuning.ipynb) | +|[HyenaDNA-Fine-Tuning.ipynb](./notebooks/HyenaDNA-Fine-Tuning.ipynb)|An example of how to fine-tune the HyenaDNA model on downstream benchmarks.|[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/helicalAI/helical/blob/main/examples/notebooks/HyenaDNA-Fine-Tuning.ipynb) | | Coming Soon | New models such as SCimilarity, scVI; benchmarking scripts; new use cases; others | ## Stuck somewhere ? Other ideas ? 

diff --git a/examples/notebooks/Cell-Type-Annotation.ipynb b/examples/notebooks/Cell-Type-Annotation.ipynb index 2bf18f2b..883f398c 100644 --- a/examples/notebooks/Cell-Type-Annotation.ipynb +++ b/examples/notebooks/Cell-Type-Annotation.ipynb @@ -1147,1402 +1147,7 @@ ] } ], - "metadata": { […] } [≈1,400 removed lines elided: auto-generated notebook metadata (accelerator/colab settings, kernelspec, language_info) and the full "application/vnd.jupyter.widget-state+json" widget state with its repeated LayoutModel, HTMLModel, FloatProgressModel, ProgressStyleModel and DescriptionStyleModel entries] + "metadata": {}, "nbformat": 4, "nbformat_minor": 0 } diff --git a/helical/models/caduceus/fine_tuning_model.py b/helical/models/caduceus/fine_tuning_model.py index 17b37276..7fd952d7 100644 --- a/helical/models/caduceus/fine_tuning_model.py +++ b/helical/models/caduceus/fine_tuning_model.py @@ -137,8 +137,8 @@ def train(self, e.g. optimizer_params = {'lr': 0.0001} loss_function : torch.nn.modules.loss, default=torch.nn.modules.loss.CrossEntropyLoss() The loss function to be used. - label : str, optional, default="cell_types" - The column in the dataset containing the training labels. These should be stored as unique per class integers. + train_labels : np.ndarray + Training labels for the dataset, stored as unique per-class integers. epochs : int, optional, default=10 The number of epochs to train the model trainable_layers : int, optional, default=2
""" - LOGGER.info("Processing data") - + LOGGER.info("Processing data for Caduceus.") self.ensure_dna_sequence_validity(sequences) + max_length = min(len(max(sequences, key=len)), self.config['input_size'])+1 # tokenized_sequences = [] @@ -125,7 +125,9 @@ def process_data(self, sequences: List[str], return_tensors: str="pt", padding: tokenized_sequences = self.tokenizer(sequences, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length) # tokenized_sequences.append(tokenized_seq) - return Dataset.from_dict(tokenized_sequences) + dataset = Dataset.from_dict(tokenized_sequences) + LOGGER.info("Successfully processed the data for Caduceus.") + return dataset def get_embeddings(self, dataset: Dataset) -> np.ndarray: """Get the embeddings for the tokenized sequence. @@ -141,7 +143,7 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: The embeddings for the tokenized sequence in the form of a numpy array. NOTE: This method returns the embeddings using the pooling strategy specified in the config. """ - LOGGER.info("Inference started") + LOGGER.info("Started getting embeddings:") dataloader = DataLoader(dataset, collate_fn=self._collate_fn, batch_size=self.config['batch_size'], shuffle=False, num_workers=self.config['nproc']) embeddings = [] @@ -154,5 +156,6 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: del batch del outputs - + + LOGGER.info(f"Finished getting embeddings.") return np.vstack(embeddings) diff --git a/helical/models/geneformer/model.py b/helical/models/geneformer/model.py index 0d863e40..bdf2d109 100644 --- a/helical/models/geneformer/model.py +++ b/helical/models/geneformer/model.py @@ -4,7 +4,6 @@ import numpy as np from anndata import AnnData from helical.utils.downloader import Downloader -import pickle from transformers import BertForMaskedLM from helical.models.geneformer.geneformer_utils import get_embs,quant_layers from helical.models.geneformer.geneformer_tokenizer import TranscriptomeTokenizer @@ -86,7 +85,6 @@ def __init__(self, configurer: GeneformerConfig = default_configurer) -> None: self.model.eval() self.model = self.model.to(self.device) - self.layer_to_quant = quant_layers(self.model) + self.config['emb_layer'] self.emb_mode = self.config['emb_mode'] self.forward_batch_size = self.config['batch_size'] @@ -105,7 +103,9 @@ def __init__(self, configurer: GeneformerConfig = default_configurer) -> None: self.eos_present = True if "" in self.tk.gene_token_dict else False LOGGER.info(f"Model finished initializing.") - + mode = "training" if self.model.training else "eval" + LOGGER.info(f"'{self.config['model_name']}' model is in '{mode}' mode, on device '{self.device}' with embedding mode '{self.emb_mode}'.") + def process_data(self, adata: AnnData, gene_names: str = "index", @@ -141,7 +141,7 @@ def process_data(self, Dataset The tokenized dataset in the form of a Huggingface Dataset object. 
""" - + LOGGER.info(f"Processing data for Geneformer.") self.ensure_rna_data_validity(adata, gene_names, use_raw_counts) # map gene symbols to ensemble ids if provided @@ -162,6 +162,8 @@ def process_data(self, if output_path: output_path = Path(output_path).with_suffix(".dataset") tokenized_dataset.save_to_disk(output_path) + + LOGGER.info(f"Successfully processed the data for Geneformer.") return tokenized_dataset def get_embeddings(self, dataset: Dataset) -> np.array: @@ -177,7 +179,7 @@ def get_embeddings(self, dataset: Dataset) -> np.array: np.array The gene embeddings in the form of a numpy array """ - LOGGER.info(f"Inference started:") + LOGGER.info(f"Started getting embeddings:") embeddings = get_embs( self.model, dataset, @@ -192,4 +194,5 @@ def get_embeddings(self, dataset: Dataset) -> np.array: self.device ) + LOGGER.info(f"Finished getting embeddings.") return embeddings diff --git a/helical/models/helix_mrna/model.py b/helical/models/helix_mrna/model.py index 1ebae162..f650cadc 100644 --- a/helical/models/helix_mrna/model.py +++ b/helical/models/helix_mrna/model.py @@ -12,7 +12,7 @@ import logging -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class HelixmRNA(HelicalRNAModel): """Helix-mRNA Model. @@ -58,7 +58,11 @@ def __init__(self, configurer: HelixmRNAConfig = default_configurer): self.model = HelixmRNAPretrainedModel.from_pretrained(self.config["model_name"]) self.pretrained_config = HelixmRNAPretrainedConfig.from_pretrained(self.config["model_name"], trust_remote=True) self.tokenizer = CharTokenizer.from_pretrained(self.config["model_name"], trust_remote=True) - logger.info("Helix-mRNA initialized successfully.") + self.model.to(self.config["device"]) + + LOGGER.info("Helix-mRNA initialized successfully.") + mode = "training" if self.model.training else "eval" + LOGGER.info(f"'{self.config['model_name']}' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}'.") def process_data(self, sequences: str) -> Dataset: """Process the mRNA sequences and return a Dataset object. @@ -73,6 +77,7 @@ def process_data(self, sequences: str) -> Dataset: Dataset The dataset object. """ + LOGGER.info(f"Processing data for Helix-mRNA.") self.ensure_rna_sequence_validity(sequences) tokenized_sequences = self.tokenizer(sequences, @@ -83,7 +88,8 @@ def process_data(self, sequences: str) -> Dataset: return_special_tokens_mask=True) dataset = Dataset.from_dict(tokenized_sequences) - + + LOGGER.info("Successfully processed the data for Helix-mRNA.") return dataset def get_embeddings(self, dataset: Dataset) -> np.ndarray: @@ -99,11 +105,10 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: np.ndarray The embeddings array. 
""" + LOGGER.info("Started getting embeddings:") dataloader = DataLoader(dataset, collate_fn=self._collate_fn, batch_size=self.config["batch_size"], shuffle=False) embeddings = [] - self.model.to(self.config["device"]) - progress_bar = tqdm(dataloader, desc="Getting embeddings") with torch.no_grad(): for batch in progress_bar: @@ -120,9 +125,9 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: del batch del output + LOGGER.info(f"Finished getting embeddings.") return np.concatenate(embeddings) - def _collate_fn(self, batch): input_ids = torch.tensor([item["input_ids"] for item in batch]) special_tokens_mask = torch.tensor([item["special_tokens_mask"] for item in batch]) diff --git a/helical/models/hyena_dna/hyena_dna_config.py b/helical/models/hyena_dna/hyena_dna_config.py index 3a3e5f9a..70670e9f 100644 --- a/helical/models/hyena_dna/hyena_dna_config.py +++ b/helical/models/hyena_dna/hyena_dna_config.py @@ -102,6 +102,7 @@ def __init__( list_of_files_to_download = [f"hyena_dna/{model_name}.ckpt"] self.config = { + "model_name": model_name, "model_path": Path(CACHE_DIR_HELICAL, f"hyena_dna/{model_name}.ckpt"), "list_of_files_to_download": list_of_files_to_download, "batch_size": batch_size, diff --git a/helical/models/hyena_dna/model.py b/helical/models/hyena_dna/model.py index 89cb930c..d375010c 100644 --- a/helical/models/hyena_dna/model.py +++ b/helical/models/hyena_dna/model.py @@ -67,8 +67,11 @@ def __init__(self, configurer: HyenaDNAConfig = default_configurer) -> None: self.device = self.config['device'] self.model.to(self.device) self.model.eval() + LOGGER.info(f"Model finished initializing.") - + mode = "training" if self.model.training else "eval" + LOGGER.info(f"'{self.config['model_name']}' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}'.") + def process_data(self, sequences: list[str], return_tensors: str="pt", padding: str="max_length", truncation: bool=True) -> Dataset: """Process the input DNA sequence. @@ -88,8 +91,7 @@ def process_data(self, sequences: list[str], return_tensors: str="pt", padding: Dataset Containing processed DNA sequences. """ - LOGGER.info("Processing data") - + LOGGER.info("Processing data for HyenaDNA.") self.ensure_dna_sequence_validity(sequences) max_length = len(max(sequences, key=len))+2 # +2 for special tokens at the beginning and end of sequences @@ -97,7 +99,7 @@ def process_data(self, sequences: list[str], return_tensors: str="pt", padding: tokenized_sequences = self.tokenizer(sequences, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length) dataset = Dataset.from_dict(tokenized_sequences) - LOGGER.info(f"Data processing finished.") + LOGGER.info(f"Succesfully prepared the HyenaDNA Dataset.") return dataset def get_embeddings(self, dataset: Dataset) -> torch.Tensor: @@ -114,7 +116,7 @@ def get_embeddings(self, dataset: Dataset) -> torch.Tensor: The embeddings for the tokenized sequence in the form of a numpy array. 
""" - LOGGER.info(f"Inference started") + LOGGER.info(f"Started getting embeddings:") train_data_loader = DataLoader(dataset, collate_fn=self._collate_fn, batch_size=self.config["batch_size"]) with torch.inference_mode(): @@ -123,6 +125,7 @@ def get_embeddings(self, dataset: Dataset) -> torch.Tensor: input_data = batch["input_ids"].to(self.device) embeddings.append(self.model(input_data).detach().cpu().numpy()) + LOGGER.info(f"Finished getting embeddings.") return np.vstack(embeddings) diff --git a/helical/models/mamba2_mrna/model.py b/helical/models/mamba2_mrna/model.py index 64c21087..99724128 100644 --- a/helical/models/mamba2_mrna/model.py +++ b/helical/models/mamba2_mrna/model.py @@ -10,7 +10,7 @@ import logging -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) class Mamba2mRNA(HelicalRNAModel): """Mamba2-mRNA Model. @@ -52,9 +52,13 @@ def __init__(self, configurer: Mamba2mRNAConfig = default_configurer): self.model = Mamba2Model.from_pretrained(self.config["model_name"]) self.pretrained_config = Mamba2Config.from_pretrained(self.config["model_name"], trust_remote=True) self.tokenizer = CharTokenizer.from_pretrained(self.config["model_name"], trust_remote=True) - + self.model.to(self.config["device"]) self.model.post_init() - logger.info("Mamba2-mRNA initialized successfully.") + + LOGGER.info("Mamba2-mRNA initialized successfully.") + mode = "training" if self.model.training else "eval" + LOGGER.info(f"'{self.config['model_name']}' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}'.") + def process_data(self, sequences: str) -> Dataset: """Process the mRNA sequences and return a Dataset object. @@ -69,6 +73,7 @@ def process_data(self, sequences: str) -> Dataset: Dataset The dataset object. """ + LOGGER.info(f"Processing data for Mamba2-mRNA.") self.ensure_rna_sequence_validity(sequences) tokenized_sequences = self.tokenizer(sequences, @@ -79,7 +84,8 @@ def process_data(self, sequences: str) -> Dataset: return_special_tokens_mask=True) dataset = Dataset.from_dict(tokenized_sequences) - + + LOGGER.info(f"Successfully preprocessed the data for Mamba2-mRNA.") return dataset def get_embeddings(self, dataset: Dataset) -> np.ndarray: @@ -95,11 +101,10 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: np.ndarray The embeddings array. """ + LOGGER.info(f"Started getting embeddings:") dataloader = DataLoader(dataset, collate_fn=self._collate_fn, batch_size=self.config["batch_size"], shuffle=False) embeddings = [] - self.model.to(self.config["device"]) - progress_bar = tqdm(dataloader, desc="Getting embeddings") with torch.no_grad(): for batch in progress_bar: @@ -116,7 +121,7 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: if self.pretrained_config.pad_token_id is None and batch_size > 1: message = "Cannot handle batch sizes > 1 if no padding token is defined." 
- logger.error(message) + LOGGER.error(message) raise ValueError(message) if self.pretrained_config.pad_token_id is None: @@ -139,6 +144,7 @@ def get_embeddings(self, dataset: Dataset) -> np.ndarray: del batch del output + LOGGER.info(f"Finished getting embeddings.") return np.concatenate(embeddings) def _collate_fn(self, batch): diff --git a/helical/models/scgpt/model.py b/helical/models/scgpt/model.py index 3eee5462..0557ddad 100644 --- a/helical/models/scgpt/model.py +++ b/helical/models/scgpt/model.py @@ -77,7 +77,10 @@ def __init__(self, configurer: scGPTConfig = configurer) -> None: self.model = self.accelerator.prepare(self.model) else: self.accelerator = None + LOGGER.info(f"Model finished initializing.") + mode = "training" if self.model.training else "eval" + LOGGER.info(f"'scGPT' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}' with embedding mode '{self.config['emb_mode']}'.") def get_embeddings(self, dataset: Dataset) -> np.array: """Gets the gene embeddings @@ -94,7 +97,7 @@ def get_embeddings(self, dataset: Dataset) -> np.array: The return type depends on the `emb_mode` parameter in the configuration. If `emb_mode` is set to "gene", the embeddings are returned as a list of pd.Series which contain a mapping of gene_name:embedding for each cell. """ - LOGGER.info(f"Inference started:") + LOGGER.info(f"Started getting embeddings:") # fix seeds np.random.seed(self.config["binning_seed"]) @@ -149,6 +152,8 @@ def get_embeddings(self, dataset: Dataset) -> np.array: resulting_embeddings.extend(self._compute_embeddings_depending_on_mode(embeddings, data_dict)) resulting_embeddings = self._normalize_embeddings(resulting_embeddings) + + LOGGER.info(f"Finished getting embeddings.") return resulting_embeddings def _normalize_embeddings(self, resulting_embeddings: torch.tensor) -> np.ndarray: @@ -248,8 +253,9 @@ def process_data(self, The processed dataset. """ - + LOGGER.info(f"Processing data for scGPT.") self.ensure_data_validity(adata, gene_names, use_batch_labels, use_raw_counts) + self.gene_names = gene_names if fine_tuning: # Preprocess the dataset and select `N_HVG` highly variable genes for downstream analysis. 
@@ -288,6 +294,8 @@ def process_data(self,
         dataset = Dataset(
             count_matrix, gene_ids, self.vocab, self.config, batch_ids if use_batch_labels else None
         )
+
+        LOGGER.info("Successfully processed the data for scGPT.")
         return dataset
 
 
diff --git a/helical/models/scgpt/scgpt_utils.py b/helical/models/scgpt/scgpt_utils.py
index 3cc9071b..fb826edf 100644
--- a/helical/models/scgpt/scgpt_utils.py
+++ b/helical/models/scgpt/scgpt_utils.py
@@ -4,6 +4,9 @@
 import json
 from typing import List, Mapping, Optional
 import torch
+import logging
+
+LOGGER = logging.getLogger(__name__)
 
 def load_pretrained(
     model: torch.nn.Module,
@@ -46,14 +49,14 @@ def load_pretrained(
     if strict:
         if verbose:
             for k, v in pretrained_params.items():
-                logger.info(f"Loading parameter {k} with shape {v.shape}")
+                LOGGER.info(f"Loading parameter {k} with shape {v.shape}")
         model_dict.update(pretrained_params)
         model.load_state_dict(model_dict)
     else:
         if verbose:
             for k, v in pretrained_params.items():
                 if k in model_dict and v.shape == model_dict[k].shape:
-                    logger.info(f"Loading parameter {k} with shape {v.shape}")
+                    LOGGER.info(f"Loading parameter {k} with shape {v.shape}")
         pretrained_params = {
             k: v
             for k, v in pretrained_params.items()
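The rename running through these hunks (`logger` to `LOGGER`) is purely cosmetic: each module still creates a single logger once at import time, named after the module's dotted import path, and nothing in the module decides where the records go. A minimal sketch of the pattern, with a hypothetical `load_parameters` standing in for the real `load_pretrained` above:

    import logging

    # One logger per module, created at import time and named after the
    # module's dotted import path (e.g. "helical.models.scgpt.scgpt_utils").
    LOGGER = logging.getLogger(__name__)

    def load_parameters(pretrained_params: dict, verbose: bool = True) -> None:
        # Mirrors the verbose branch above: one INFO record per parameter.
        if verbose:
            for k, v in pretrained_params.items():
                LOGGER.info(f"Loading parameter {k} with shape {v.shape}")

Records emitted through `LOGGER` propagate up to whatever handlers the calling application configures, which is what makes the INFO/DEBUG split later in this patch useful.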
diff --git a/helical/models/uce/gene_embeddings.py b/helical/models/uce/gene_embeddings.py
index ed0563c0..55f5eee9 100644
--- a/helical/models/uce/gene_embeddings.py
+++ b/helical/models/uce/gene_embeddings.py
@@ -9,7 +9,7 @@
 import logging
 from helical.models.uce.uce_config import SPECIES_GENE_EMBEDDINGS
 
-logger = logging.getLogger(__name__)
+LOGGER = logging.getLogger(__name__)
 
 def get_gene_embedding_paths(embedding_path: Path) -> Dict[str, Dict[str, Path]]:
@@ -54,7 +54,7 @@ def load_gene_embeddings_adata(adata: AnnData, species: list, embedding_model: s
     # Ensure embeddings are available for all species
     if not (species_names_set <= available_species):
-        logger.error(f'Missing gene embeddings here: {embeddings_path}')
+        LOGGER.error(f'Missing gene embeddings here: {embeddings_path}')
         raise ValueError(f'The following species do not have gene embeddings: {species_names_set - available_species}')
     # Load gene embeddings for desired species (and convert gene symbols to lower case)
     species_to_gene_symbol_to_embedding = {
@@ -65,7 +65,7 @@ def load_gene_embeddings_adata(adata: AnnData, species: list, embedding_model: s
         for species in species_names
     }
 
-    logger.info(f'Finished loading gene embeddings for {species_names_set} from {embeddings_path}')
+    LOGGER.info(f"Finished loading gene embeddings for '{', '.join(map(str, species_names_set))}' from {embeddings_path} for model '{embedding_model}'.")
 
     # Determine which genes to include based on gene expression and embedding availability
     genes_with_embeddings = set.intersection(*[
@@ -77,7 +77,7 @@ def load_gene_embeddings_adata(adata: AnnData, species: list, embedding_model: s
     # Subset data to only use genes with embeddings
     filtered_adata = adata[:, adata.var_names.isin(genes_to_use)]
     filtered = adata.var_names.shape[0] - filtered_adata.var_names.shape[0]
-    logger.info(f'Filtered out {filtered} genes to a total of {filtered_adata.var_names.shape[0]} genes with embeddings.')
+    LOGGER.info(f'Filtered out {filtered} genes to a total of {filtered_adata.var_names.shape[0]} genes with embeddings.')
 
     # Load gene symbols for desired species for later use with indexes
     species_to_all_gene_symbols = {
diff --git a/helical/models/uce/model.py b/helical/models/uce/model.py
index 4a318d4e..bdccf124 100644
--- a/helical/models/uce/model.py
+++ b/helical/models/uce/model.py
@@ -71,7 +71,10 @@ def __init__(self, configurer: UCEConfig = default_configurer) -> None:
             self.model = self.accelerator.prepare(self.model)
         else:
             self.accelerator = None
+        LOGGER.info("Model finished initializing.")
 
+        mode = "training" if self.model.training else "eval"
+        LOGGER.info(f"'{self.config['model_name']}' model is in '{mode}' mode, on device '{next(self.model.parameters()).device.type}'.")
 
     def process_data(self,
                      adata: AnnData,
@@ -106,7 +109,7 @@ def process_data(self,
             An object that inherits from the `Dataset` class.
 
         """
-
+        LOGGER.info("Processing data for UCE.")
         self.ensure_rna_data_validity(adata, gene_names, use_raw_counts)
 
         if gene_names != "index":
@@ -164,7 +167,7 @@ def process_data(self,
                              datasets_to_chroms = dataset_chroms,
                              datasets_to_starts = dataset_start
                              )
-        LOGGER.info(f'Successfully prepared the UCE Dataset.')
+        LOGGER.info('Successfully processed the data for UCE.')
         return dataset
 
     def get_embeddings(self, dataset: UCEDataset) -> np.array:
@@ -180,7 +183,8 @@ def get_embeddings(self, dataset: UCEDataset) -> np.array:
         np.ndarray
             The gene embeddings in the form of a numpy array
         """
-
+        LOGGER.info("Started getting embeddings:")
+
         batch_size = self.config["batch_size"]
         dataloader = DataLoader(dataset,
                                 batch_size=batch_size,
@@ -199,7 +203,6 @@ def get_embeddings(self, dataset: UCEDataset) -> np.array:
         else:
             pbar = tqdm(dataloader)
 
-        LOGGER.info(f"Inference started")
         dataset_embeds = []
 
         # disabling gradient calculation for inference
@@ -223,6 +226,8 @@ def get_embeddings(self, dataset: UCEDataset) -> np.array:
             else:
                 dataset_embeds.append(embedding.detach().cpu().numpy())
         embeddings = np.vstack(dataset_embeds)
+
+        LOGGER.info("Finished getting embeddings.")
         return embeddings
\ No newline at end of file
diff --git a/helical/models/uce/uce_config.py b/helical/models/uce/uce_config.py
index cc96f433..98439639 100644
--- a/helical/models/uce/uce_config.py
+++ b/helical/models/uce/uce_config.py
@@ -108,6 +108,7 @@ def __init__(self,
         model_path = Path(CACHE_DIR_HELICAL, 'uce', f"{model_name}.torch")
 
         self.config = {
+            "model_name": model_name,
             "model_path": model_path,
             "list_of_files_to_download": list_of_files_to_download,
             "batch_size": batch_size,
diff --git a/helical/utils/downloader.py b/helical/utils/downloader.py
index 13aeb29b..1d0a0198 100644
--- a/helical/utils/downloader.py
+++ b/helical/utils/downloader.py
@@ -38,7 +38,7 @@ def download_via_link(self, output: Path, link: str) -> None:
         '''
 
         if output.is_file():
-            LOGGER.info(f"File: '{output}' exists already. File is not overwritten and nothing is downloaded.")
+            LOGGER.debug(f"File: '{output}' exists already. File is not overwritten and nothing is downloaded.")
         else:
             LOGGER.info(f"Starting to download: '{link}'")
@@ -66,7 +66,7 @@ def download_via_link(self, output: Path, link: str) -> None:
                     f.write(data)
             except:
                 LOGGER.error(f"Failed downloading file from '{link}'")
-        LOGGER.info(f"File saved to: '{output}'")
+        LOGGER.info(f"File saved to: '{output}'")
 
     def _display_download_progress(self, data_chunk_size: int) -> None:
         '''
@@ -105,7 +105,7 @@ def download_via_name(self, name: str) -> None:
             LOGGER.info(f"Creating Folder {os.path.dirname(output)}")
 
         if Path(output).is_file():
-            LOGGER.info(f"File: '{output}' exists already. File is not overwritten and nothing is downloaded.")
+            LOGGER.debug(f"File: '{output}' exists already. File is not overwritten and nothing is downloaded.")
         else:
             LOGGER.info(f"Starting to download: '{blob_url}'")
 
@@ -113,8 +113,7 @@ def download_via_name(self, name: str) -> None:
             logging.disable(logging.INFO)
             self.display_azure_download_progress(blob_client, blob_url, output)
             logging.disable(logging.NOTSET)
-
-        LOGGER.info(f"File saved to: '{output}'")
+        LOGGER.info(f"File saved to: '{output}'")
 
     def display_azure_download_progress(self, blob_client: BlobClient, blob_url: str, output: Path) -> None:
         """
diff --git a/mkdocs.yml b/mkdocs.yml
index f759376d..3e228d6f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -51,7 +51,6 @@ nav:
     - Geneformer-vs-UCE: ./notebooks/Geneformer-vs-UCE.ipynb
     - Hyena-DNA-Inference: ./notebooks/Hyena-DNA-Inference.ipynb
     - HyenaDNA-Fine-Tuning: ./notebooks/HyenaDNA-Fine-Tuning.ipynb
-    - Benchmarking: ./benchmarking/description.md
 
 theme:
   name: material
diff --git a/pyproject.toml b/pyproject.toml
index b5f41b3a..a0e6436e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "helical"
-version = "0.0.1a17"
+version = "0.0.1a18"
 authors = [
     { name="Helical Team", email="support@helical-ai.com" },
 ]
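To see the new log lines end to end, a downstream script only needs to configure the standard library's `logging` module. The snippet below is a sketch: the import path, the `Mamba2mRNAConfig(batch_size=...)` argument, and the example sequences are assumptions for illustration, not part of this patch.

    import logging

    # Surface the INFO-level records emitted by the patched modules.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")

    # The "file exists already" messages were demoted from INFO to DEBUG above,
    # so seeing them on repeated runs is now opt-in (assuming the downloader's
    # logger is named after its module path):
    logging.getLogger("helical.utils.downloader").setLevel(logging.DEBUG)

    from helical.models.mamba2_mrna import Mamba2mRNA, Mamba2mRNAConfig  # assumed import path

    model = Mamba2mRNA(Mamba2mRNAConfig(batch_size=2))      # logs init, mode and device
    dataset = model.process_data(["ACUGGUCA", "AGUCAGUC"])  # logs the processing messages
    embeddings = model.get_embeddings(dataset)              # logs start/finish around the progress bar
    print(embeddings.shape)

With the root logger left at its default WARNING level none of these INFO records appear, which is the point of the `LOGGER.info`/`LOGGER.debug` split introduced here: the caller, not the library, chooses the verbosity.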