Skip to content

Commit

Permalink
Update adata_to_multivec (#326)
Browse files Browse the repository at this point in the history
* Update adata_to_multivec

* Update pyproject.toml

* Update multivec.py
  • Loading branch information
keller-mark authored Mar 13, 2024
1 parent 989af83 commit 3810797
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 3 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ dependencies = [
'scanpy>=1.9.3',
'ome-zarr==0.8.3',
'tifffile>=2020.10.1',
'jsonschema>=3.2'
'jsonschema>=3.2',
'tqdm>=4.1.0'
]

[project.optional-dependencies]
Expand Down
38 changes: 36 additions & 2 deletions vitessce/data_utils/multivec.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,44 @@
import zarr
import numpy as np
import pandas as pd
from tqdm import tqdm

from .anndata import to_dense
from .entities import GenomicProfiles


def adata_to_multivec_zarr(adata, output_path, obs_set_col, obs_set_name, obs_set_vals=None, var_interval_col="interval", layer_key=None, assembly="hg38", starting_resolution=5000):
def adata_to_multivec_zarr(adata, output_path, obs_set_col, obs_set_name, obs_set_vals=None, var_interval_col="interval", layer_key=None, assembly="hg38", starting_resolution=5000, chr_subset=None):
"""
Convert an AnnData object containing a cell-by-bin matrix to a Multivec-Zarr store.
:param adata: The object to convert.
:type adata: anndata.AnnData
:param output_path: The path to the output Zarr store.
:type output_path: str
:param obs_set_col: The name of the column in adata.obs that contains the cluster IDs.
:type obs_set_col: str
:param obs_set_name: The name of the cluster set.
:type obs_set_name: str
:param obs_set_vals: The cluster IDs to include in the output. If None, all cluster IDs will be included. This can be used to override the order of the cluster IDs.
:type obs_set_vals: list[str] or None
:param var_interval_col: The name of the column in adata.var that contains the bin interval strings. By default, "interval".
:type var_interval_col: str
:param layer_key: The name of the layer in adata.layers to use. If None, adata.X will be used. By default, None.
:type layer_key: str or None
:param assembly: The name of the genome assembly. By default, "hg38".
:type assembly: str
:param starting_resolution: The starting resolution of the data. By default, 5000.
:type starting_resolution: int
:param chr_subset: For debugging purposes, a subset of chromosomes to process. If None, all chromosomes in the assembly will be processed. By default, None.
:type chr_subset: list[str] or None
"""
in_mtx = adata.layers[layer_key] if layer_key is not None else adata.X
in_barcodes_df = adata.obs
in_bins_df = adata.var

# Ensure that in_bins_df has a sequential integer index
in_bins_df = in_bins_df.reset_index()

in_mtx = to_dense(in_mtx) # TODO: is this necessary?

# The bin datafram consists of one column like chrName:binStart-binEnd
Expand Down Expand Up @@ -84,20 +112,26 @@ def convert_bin_name_to_chr_end(bin_name):
)
chrom_name_to_length = genomic_profiles.chrom_name_to_length

# For debugging purposes, allow the user to specify a subset of chromosomes to process.
chrom_names = chr_subset if chr_subset is not None else list(chrom_name_to_length.keys())

# Create each chromosome dataset.
for chr_name, chr_len in chrom_name_to_length.items():
for chr_name in tqdm(chrom_names):
chr_len = chrom_name_to_length[chr_name]
# The bins dataframe frustratingly does not contain every bin.
# We need to figure out which bins are missing.

# We want to check for missing bins in each chromosome separately,
# otherwise too much memory is used during the join step.
chr_bins_in_df = in_bins_df.loc[in_bins_df["chr_name"] == chr_name]
if chr_bins_in_df.shape[0] == 0:
print("Warning: No bins found for chromosome", chr_name)
# No processing or output is necessary if there is no data for this chromosome.
# Continue on through all resolutions of this chromosome to the next chromosome.
continue
# Determine the indices of the matrix at which the bins for this chromosome start and end.
chr_bin_i_start = int(chr_bins_in_df.head(1).iloc[0].name)
# +1 because the end index is exclusive.
chr_bin_i_end = int(chr_bins_in_df.tail(1).iloc[0].name) + 1

# Extract the part of the matrix corresponding to the current chromosome.
Expand Down

0 comments on commit 3810797

Please sign in to comment.