diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d5c6e8f..39ade5f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: Test & lint package & deploy documentation
on:
push:
- branches: [main]
+ branches: [main, dev]
pull_request:
branches: [main]
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 7b7e632..05920e8 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,5 +1,5 @@
{
- "version": "0.2.0",
+ "version": "0.3.0",
"configurations": [
{
"name": "Python: Current File",
diff --git a/CITATION.cff b/CITATION.cff
index 708056f..3fa7c26 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -6,6 +6,6 @@ authors:
- family-names: "Puelles"
given-names: "Victor"
title: "pytximport: Fast gene count estimation from transcript quantification files in Python"
-version: 0.2.0
+version: 0.3.0
date-released: 2024-06-01
url: "https://github.com/complextissue/pytximport"
diff --git a/README.md b/README.md
index c8ef165..89fcca9 100755
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ results = tximport(
Please cite both the original publication as well as this Python implementation:
- Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
-- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.2.0) [Computer software]. https://github.com/complextissue/pytximport
+- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.3.0) [Computer software]. https://github.com/complextissue/pytximport
## License
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e581058..fb5dcd0 100755
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -18,7 +18,7 @@
author = "Malte Kuehl"
# The full version, including alpha/beta/rc tags
-release = "0.2.0"
+release = "0.3.0"
# -- General configuration ---------------------------------------------------
@@ -106,7 +106,7 @@
html_theme = "furo"
html_theme_options = {
- "announcement": "pytximport 0.2.0 has been released!",
+ "announcement": "pytximport 0.3.0 has been released!",
}
html_title = "pytximport"
diff --git a/docs/source/example.ipynb b/docs/source/example.ipynb
index b67d870..01b542e 100644
--- a/docs/source/example.ipynb
+++ b/docs/source/example.ipynb
@@ -361,7 +361,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Reading quantification files: 2it [00:00, 322.27it/s]\n"
+ "Reading quantification files: 2it [00:00, 289.77it/s]\n"
]
},
{
@@ -739,9 +739,9 @@
"Data variables:\n",
" abundance (gene_id, file) float64 8kB 0.08291 0.0 0.09854 ... 0.4618 0.0\n",
" counts (gene_id, file) float64 8kB 1.001 0.0 1.042 ... 2.0 6.184 0.0\n",
- " length (gene_id, file) float64 8kB 509.1 509.1 445.8 ... 564.6 564.6
"
],
"text/plain": [
" Size: 87kB\n",
@@ -891,6 +891,7 @@
" [\"../../test/data/salmon/multiple/Sample_1.sf\", \"../../test/data/salmon/multiple/Sample_2.sf\"],\n",
" \"salmon\",\n",
" transcript_gene_mapping_mouse,\n",
+ " output_type=\"xarray\", # or \"anndata\"\n",
")\n",
"txi"
]
@@ -918,7 +919,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Reading quantification files: 1it [00:00, 399.19it/s]\n"
+ "Reading quantification files: 1it [00:00, 291.80it/s]\n"
]
},
{
@@ -991,8 +992,8 @@
" counts_from_abundance=\"length_scaled_tpm\",\n",
" return_transcript_data=True,\n",
")\n",
- "pd.DataFrame(txi[\"counts\"], index=txi.coords[\"transcript_id\"], columns=txi.coords[\"file_path\"]).sort_values(\n",
- " by=txi.coords[\"file_path\"].data[0],\n",
+ "pd.DataFrame(txi.X.T, index=txi.var.index, columns=txi.obs.index).sort_values(\n",
+ " by=txi.obs.index[0],\n",
" ascending=False,\n",
").head(5)"
]
@@ -1013,7 +1014,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Reading quantification files: 1it [00:00, 447.01it/s]\n"
+ "Reading quantification files: 1it [00:00, 534.10it/s]\n"
]
},
{
@@ -1085,6 +1086,7 @@
" \"salmon\",\n",
" transcript_gene_mapping_human,\n",
" counts_from_abundance=\"length_scaled_tpm\",\n",
+ " output_type=\"xarray\",\n",
" return_transcript_data=False,\n",
")\n",
"pd.DataFrame(txi[\"counts\"], index=txi.coords[\"gene_id\"], columns=txi.coords[\"file_path\"]).sort_values(\n",
@@ -1123,7 +1125,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Reading quantification files: 1it [00:00, 372.17it/s]\n"
+ "Reading quantification files: 1it [00:00, 457.64it/s]\n"
]
},
{
@@ -1166,17 +1168,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "2024-05-31 09:42:58,831: Starting the import.\n",
- "Reading quantification files: 1it [00:00, 277.84it/s]\n",
- "2024-05-31 09:42:59,020: Converting transcript-level expression to gene-level expression.\n",
- "2024-05-31 09:42:59,194: Matching gene_ids.\n",
- "2024-05-31 09:42:59,323: Creating gene abundance.\n",
- "2024-05-31 09:42:59,460: Creating gene counts.\n",
- "2024-05-31 09:42:59,463: Creating lengths.\n",
- "2024-05-31 09:42:59,466: Replacing missing lengths.\n",
- "2024-05-31 09:42:59,467: Creating gene expression dataset.\n",
- "2024-05-31 09:42:59,470: Saving the gene-level expression to: ../../test/data/salmon/quant.h5ad.\n",
- "2024-05-31 09:42:59,474: Finished the import in 0.64 seconds.\n"
+ "2024-06-04 19:23:31,595: Starting the import.\n",
+ "Reading quantification files: 1it [00:00, 292.14it/s]\n",
+ "2024-06-04 19:23:31,761: Converting transcript-level expression to gene-level expression.\n",
+ "2024-06-04 19:23:31,964: Matching gene_ids.\n",
+ "2024-06-04 19:23:32,120: Creating gene abundance.\n",
+ "2024-06-04 19:23:32,257: Creating gene counts.\n",
+ "2024-06-04 19:23:32,260: Creating lengths.\n",
+ "2024-06-04 19:23:32,264: Replacing missing lengths.\n",
+ "2024-06-04 19:23:32,265: Creating gene expression dataset.\n",
+ "2024-06-04 19:23:32,269: Saving the gene-level expression to: ../../test/data/salmon/quant.h5ad.\n",
+ "2024-06-04 19:23:32,276: Finished the import in 0.68 seconds.\n"
]
}
],
@@ -1238,7 +1240,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Reading quantification files: 4it [00:01, 3.28it/s]\n",
+ "Reading quantification files: 4it [00:01, 3.03it/s]\n",
"WARNING:root:Not all transcripts are present in the mapping. 33815 out of 253181 missing.\n"
]
},
@@ -1265,7 +1267,6 @@
" \"salmon\",\n",
" transcript_gene_mapping,\n",
" counts_from_abundance=\"length_scaled_tpm\",\n",
- " output_type=\"anndata\",\n",
")\n",
"result"
]
@@ -1367,21 +1368,21 @@
"output_type": "stream",
"text": [
"Fitting size factors...\n",
- "... done in 0.00 seconds.\n",
+ "... done in 0.01 seconds.\n",
"\n",
"Fitting dispersions...\n",
- "... done in 0.59 seconds.\n",
+ "... done in 1.30 seconds.\n",
"\n",
"Fitting dispersion trend curve...\n",
- "... done in 0.32 seconds.\n",
+ "... done in 0.63 seconds.\n",
"\n",
"/Users/au734063/Documents/code/pytximport-publish/pytximport/.venv/lib/python3.12/site-packages/pydeseq2/dds.py:448: UserWarning: As the residual degrees of freedom is less than 3, the distribution of log dispersions is especially asymmetric and likely to be poorly estimated by the MAD.\n",
" self.fit_dispersion_prior()\n",
"Fitting MAP dispersions...\n",
- "... done in 0.77 seconds.\n",
+ "... done in 0.95 seconds.\n",
"\n",
"Fitting LFCs...\n",
- "... done in 0.53 seconds.\n",
+ "... done in 1.42 seconds.\n",
"\n",
"Replacing 0 outlier genes.\n",
"\n"
@@ -1451,7 +1452,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "... done in 0.32 seconds.\n",
+ "... done in 0.35 seconds.\n",
"\n"
]
}
diff --git a/docs/source/start.md b/docs/source/start.md
index 42a3095..b73b98b 100755
--- a/docs/source/start.md
+++ b/docs/source/start.md
@@ -61,7 +61,7 @@ The `tximport` package has become a main stay in the bulk RNA sequencing communi
Please cite both the original publication as well as this Python implementation:
- Charlotte Soneson, Michael I. Love, Mark D. Robinson. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences, F1000Research, 4:1521, December 2015. doi: 10.12688/f1000research.7563.1
-- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.2.0) [Computer software]. https://github.com/complextissue/pytximport
+- Kuehl, M., & Puelles, V. (2024). pytximport: Fast gene count estimation from transcript quantification files in Python (Version 0.3.0) [Computer software]. https://github.com/complextissue/pytximport
## Differences
diff --git a/pytximport/_cli.py b/pytximport/_cli.py
index 34962d8..3102a28 100644
--- a/pytximport/_cli.py
+++ b/pytximport/_cli.py
@@ -32,6 +32,13 @@
type=click.Path(exists=True),
help="The path to the transcript to gene mapping file.",
)
+@click.option(
+ "-c",
+ "--counts_from_abundance",
+ "--counts-from-abundance",
+ type=click.Choice(["scaled_tpm", "length_scaled_tpm"]),
+ help="The type of counts to convert to.",
+)
@click.option(
"-o",
"--save_path",
@@ -43,24 +50,17 @@
@click.option(
"--ignore_after_bar",
"--ignore-after-bar",
- is_flag=True,
+ type=bool,
default=True,
help="Whether to split the transcript id after the bar character (`|`).",
)
@click.option(
"--ignore_transcript_version",
"--ignore-transcript-version",
- is_flag=True,
+ type=bool,
default=True,
help="Whether to ignore the transcript version.",
)
-@click.option(
- "-c",
- "--counts_from_abundance",
- "--counts-from-abundance",
- type=click.Choice(["scaled_tpm", "length_scaled_tpm"]),
- help="The type of counts to convert to.",
-)
@click.option(
"--return_transcript_data",
"--return-transcript-data",
@@ -71,24 +71,28 @@
"-id",
"--id_column",
"--id-column",
+ type=str,
help="The column name for the transcript id.",
)
@click.option(
"-counts",
"--counts_column",
"--counts-column",
+ type=str,
help="The column name for the counts.",
)
@click.option(
"-length",
"--length_column",
"--length-column",
+ type=str,
help="The column name for the length.",
)
@click.option(
"-tpm",
"--abundance_column",
"--abundance-column",
+ type=str,
help="The column name for the abundance.",
)
@click.option(
diff --git a/pytximport/_version.py b/pytximport/_version.py
index a4ae98e..143d250 100644
--- a/pytximport/_version.py
+++ b/pytximport/_version.py
@@ -1,4 +1,4 @@
"""Version information for the pytximport package."""
# This package will follow Semantic Versioning after version 1.0.0: https://semver.org/
-__version__ = "0.2.0"
+__version__ = "0.3.0"
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
index 7bbd528..8c645e7 100644
--- a/pytximport/core/_tximport.py
+++ b/pytximport/core/_tximport.py
@@ -37,7 +37,7 @@ def tximport(
sparse_threshold: Optional[float] = None,
read_length: Optional[int] = None,
# arguments exclusive to the pytximport implementation
- output_type: Literal["xarray", "anndata"] = "xarray",
+ output_type: Literal["xarray", "anndata"] = "anndata",
output_format: Literal["csv", "h5ad"] = "csv",
save_path: Optional[Union[str, Path]] = None,
return_data: bool = True,
@@ -80,7 +80,7 @@ def tximport(
sparse_threshold (Optional[float], optional): The threshold for the sparse matrix. Currently, sparse input is
not supported. Defaults to None.
read_length (Optional[int], optional): The read length for the stringtie quantification. Defaults to None.
- output_type (Literal["xarray", "anndata"], optional): The type of output. Defaults to "xarray".
+ output_type (Literal["xarray", "anndata"], optional): The type of output. Defaults to "anndata".
output_format (Literal["csv", "h5ad"], optional): The type of output file. Defaults to "csv".
save_path (Optional[Union[str, Path]], optional): The path to save the gene-level expression. Defaults to None.
return_data (bool, optional): Whether to return the gene-level expression. Defaults to True.
@@ -270,6 +270,18 @@ def tximport(
counts_from_abundance,
)
+ if output_type == "anndata":
+ # convert to AnnData
+ return ad.AnnData(
+ X=transcript_data["counts"].values.T,
+ obs=pd.DataFrame(index=transcript_data.coords["file_path"].values),
+ var=pd.DataFrame(index=transcript_data.coords["transcript_id"].values),
+ obsm={
+ "length": transcript_data["length"].values.T,
+ "abundance": transcript_data["abundance"].values.T,
+ },
+ )
+
return transcript_data
# convert to gene-level expression
diff --git a/pytximport/utils/_convert_transcripts_to_genes.py b/pytximport/utils/_convert_transcripts_to_genes.py
index bf2a9e6..1b067e2 100644
--- a/pytximport/utils/_convert_transcripts_to_genes.py
+++ b/pytximport/utils/_convert_transcripts_to_genes.py
@@ -57,6 +57,9 @@ def convert_transcripts_to_genes(
# check that at least one transcript is protein-coding
assert any(transcript_keep_boolean), "No transcripts with the desired biotype are present in the data."
+ # calculate the total abundance before filtering
+ total_abundance = transcript_data["abundance"].sum(axis=0)
+
transcript_data = transcript_data.isel(
transcript_id=transcript_keep_boolean,
drop=True,
@@ -67,6 +70,11 @@ def convert_transcripts_to_genes(
)
transcript_ids = transcript_data.coords["transcript_id"].values
+ # recalculate the abundance for each sample
+ new_abundance = transcript_data["abundance"].sum(axis=0)
+ ratio = total_abundance / new_abundance
+ transcript_data["abundance"] = (transcript_data["abundance"].T * ratio).T
+
if ignore_after_bar:
# ignore the part of the transcript ID after the bar
transcript_ids = [transcript_id.split("|")[0] for transcript_id in transcript_ids]
diff --git a/pytximport/utils/_replace_missing_average_transcript_length.py b/pytximport/utils/_replace_missing_average_transcript_length.py
index 30fbf2e..17f9324 100644
--- a/pytximport/utils/_replace_missing_average_transcript_length.py
+++ b/pytximport/utils/_replace_missing_average_transcript_length.py
@@ -18,6 +18,9 @@ def replace_missing_average_transcript_length(
# get the rows of the DataArray with missing values
nan_rows = np.where(length.isnull().any(dim="file") == True)[0] # noqa: E712
+ gene_ids = []
+ lengths = []
+
for nan_idx in nan_rows:
row = length.isel({"gene_id": nan_idx})
gene_id = row.coords["gene_id"].data
@@ -37,6 +40,11 @@ def replace_missing_average_transcript_length(
average_gene_length = np.exp(np.mean(np.log(length.loc[{"gene_id": gene_id}].data[~column_is_nan])))
# replace the missing row with the average gene length
- length.loc[{"gene_id": gene_id}] = length.loc[{"gene_id": gene_id}].fillna(average_gene_length)
+ gene_ids.append(gene_id)
+ lengths.append(length.loc[{"gene_id": gene_id}].fillna(average_gene_length))
+
+ # batching updates seems to be faster than updating the DataArray row by row
+ if len(gene_ids) > 0:
+ length.loc[{"gene_id": gene_ids}] = lengths
return length
diff --git a/test/test_comparison.ipynb b/test/test_comparison.ipynb
index e34e80c..3be6dcc 100644
--- a/test/test_comparison.ipynb
+++ b/test/test_comparison.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -122,53 +122,53 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "2024-05-30 21:04:42,063: Starting the import.\n",
- "Reading quantification files: 4it [00:01, 3.58it/s]\n",
- "2024-05-30 21:04:43,294: Converting transcript-level expression to gene-level expression.\n",
- "2024-05-30 21:04:43,671: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
- "2024-05-30 21:04:43,952: Matching gene_ids.\n",
- "2024-05-30 21:04:44,116: Creating gene abundance.\n",
- "2024-05-30 21:04:44,213: Creating gene counts.\n",
- "2024-05-30 21:04:44,342: Creating lengths.\n",
- "2024-05-30 21:04:44,495: Replacing missing lengths.\n",
- "2024-05-30 21:04:50,105: Creating gene expression dataset.\n",
- "2024-05-30 21:04:50,136: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.\n",
- "2024-05-30 21:04:50,207: Finished the import in 8.14 seconds.\n",
- "2024-05-30 21:04:51,406: Starting the import.\n",
- "Reading quantification files: 4it [00:01, 3.63it/s]\n",
- "2024-05-30 21:04:52,631: Converting transcript-level expression to gene-level expression.\n",
- "2024-05-30 21:04:53,053: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
- "2024-05-30 21:04:53,313: Matching gene_ids.\n",
- "2024-05-30 21:04:53,467: Creating gene abundance.\n",
- "2024-05-30 21:04:53,561: Creating gene counts.\n",
- "2024-05-30 21:04:53,663: Creating lengths.\n",
- "2024-05-30 21:04:53,755: Replacing missing lengths.\n",
- "2024-05-30 21:04:59,260: Recreating gene counts from abundances.\n",
- "2024-05-30 21:04:59,260: Setting the counts to scaled TPM.\n",
- "2024-05-30 21:04:59,261: Creating gene expression dataset.\n",
- "2024-05-30 21:04:59,290: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_scaledTPM.csv.\n",
- "2024-05-30 21:04:59,449: Finished the import in 8.04 seconds.\n",
- "2024-05-30 21:05:00,696: Starting the import.\n",
- "Reading quantification files: 4it [00:01, 3.45it/s]\n",
- "2024-05-30 21:05:01,977: Converting transcript-level expression to gene-level expression.\n",
- "2024-05-30 21:05:02,376: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
- "2024-05-30 21:05:02,656: Matching gene_ids.\n",
- "2024-05-30 21:05:02,816: Creating gene abundance.\n",
- "2024-05-30 21:05:02,910: Creating gene counts.\n",
- "2024-05-30 21:05:03,002: Creating lengths.\n",
- "2024-05-30 21:05:03,096: Replacing missing lengths.\n",
- "2024-05-30 21:05:08,628: Recreating gene counts from abundances.\n",
- "2024-05-30 21:05:08,628: Setting counts to length scaled TPM.\n",
- "2024-05-30 21:05:08,631: Creating gene expression dataset.\n",
- "2024-05-30 21:05:08,658: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_lengthScaledTPM.csv.\n",
- "2024-05-30 21:05:08,751: Finished the import in 8.06 seconds.\n"
+ "2024-06-04 19:00:50,027: Starting the import.\n",
+ "Reading quantification files: 4it [00:01, 2.84it/s]\n",
+ "2024-06-04 19:00:51,561: Converting transcript-level expression to gene-level expression.\n",
+ "2024-06-04 19:00:51,981: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
+ "2024-06-04 19:00:52,335: Matching gene_ids.\n",
+ "2024-06-04 19:00:52,542: Creating gene abundance.\n",
+ "2024-06-04 19:00:52,820: Creating gene counts.\n",
+ "2024-06-04 19:00:52,936: Creating lengths.\n",
+ "2024-06-04 19:00:53,089: Replacing missing lengths.\n",
+ "2024-06-04 19:00:58,156: Creating gene expression dataset.\n",
+ "2024-06-04 19:00:58,193: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_no.csv.\n",
+ "2024-06-04 19:00:58,273: Finished the import in 8.25 seconds.\n",
+ "2024-06-04 19:00:59,769: Starting the import.\n",
+ "Reading quantification files: 4it [00:01, 2.66it/s]\n",
+ "2024-06-04 19:01:01,409: Converting transcript-level expression to gene-level expression.\n",
+ "2024-06-04 19:01:01,837: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
+ "2024-06-04 19:01:02,160: Matching gene_ids.\n",
+ "2024-06-04 19:01:02,321: Creating gene abundance.\n",
+ "2024-06-04 19:01:02,599: Creating gene counts.\n",
+ "2024-06-04 19:01:02,673: Creating lengths.\n",
+ "2024-06-04 19:01:02,791: Replacing missing lengths.\n",
+ "2024-06-04 19:01:07,705: Recreating gene counts from abundances.\n",
+ "2024-06-04 19:01:07,706: Setting the counts to scaled TPM.\n",
+ "2024-06-04 19:01:07,707: Creating gene expression dataset.\n",
+ "2024-06-04 19:01:07,736: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_scaledTPM.csv.\n",
+ "2024-06-04 19:01:07,827: Finished the import in 8.06 seconds.\n",
+ "2024-06-04 19:01:09,437: Starting the import.\n",
+ "Reading quantification files: 4it [00:01, 3.27it/s]\n",
+ "2024-06-04 19:01:10,794: Converting transcript-level expression to gene-level expression.\n",
+ "2024-06-04 19:01:11,424: Not all transcripts are present in the mapping. 31380 out of 253181 missing.\n",
+ "2024-06-04 19:01:11,870: Matching gene_ids.\n",
+ "2024-06-04 19:01:12,045: Creating gene abundance.\n",
+ "2024-06-04 19:01:12,385: Creating gene counts.\n",
+ "2024-06-04 19:01:12,462: Creating lengths.\n",
+ "2024-06-04 19:01:12,586: Replacing missing lengths.\n",
+ "2024-06-04 19:01:17,438: Recreating gene counts from abundances.\n",
+ "2024-06-04 19:01:17,439: Setting counts to length scaled TPM.\n",
+ "2024-06-04 19:01:17,444: Creating gene expression dataset.\n",
+ "2024-06-04 19:01:17,484: Saving the gene-level expression to: data/fabry_disease/counts_pytximport_lengthScaledTPM.csv.\n",
+ "2024-06-04 19:01:17,586: Finished the import in 8.15 seconds.\n"
]
}
],
@@ -180,7 +180,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -201,6 +201,13 @@
"pd.testing.assert_frame_equal(counts_tximport_scaledTPM, counts_pytximport_scaledTPM)\n",
"pd.testing.assert_frame_equal(counts_tximport_lengthScaledTPM, counts_pytximport_lengthScaledTPM)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/test/test_correctness.py b/test/test_correctness.py
index 0ec23f6..8b6184e 100644
--- a/test/test_correctness.py
+++ b/test/test_correctness.py
@@ -27,6 +27,7 @@ def test_correctness(
fabry_directory / "transcript_gene_mapping_human.csv",
ignore_transcript_version=True,
ignore_after_bar=True,
+ output_type="xarray",
counts_from_abundance=counts_from_abundance, # type: ignore
)
diff --git a/test/test_kallisto.py b/test/test_kallisto.py
index 8b5e836..40b4adf 100644
--- a/test/test_kallisto.py
+++ b/test/test_kallisto.py
@@ -16,7 +16,7 @@ def test_kallisto(
"""Test importing a kallisto quantification file.
Args:
- kallisto_file (Path): [description]
+ kallisto_file (Path): Path to the kallisto quantification file.
"""
for counts_from_abundance in [None, "scaled_tpm", "length_scaled_tpm"]:
result = tximport(
@@ -25,6 +25,7 @@ def test_kallisto(
transcript_gene_mapping_human,
ignore_transcript_version=True,
ignore_after_bar=True,
+ output_type="xarray",
counts_from_abundance=counts_from_abundance, # type: ignore
)
@@ -42,7 +43,7 @@ def test_multiple_kallisto(
"""Test importing kallisto quantification files.
Args:
- kallisto_multiple_files (Path): [description]
+ kallisto_multiple_files (List[Path]): List of paths to the kallisto quantification files.
"""
for counts_from_abundance in [None, "scaled_tpm", "length_scaled_tpm"]:
for existence_optional in [True, False]:
@@ -63,6 +64,7 @@ def test_multiple_kallisto(
abundance_column="tpm",
ignore_transcript_version=True,
ignore_after_bar=True,
+ output_type="xarray",
counts_from_abundance=counts_from_abundance, # type: ignore
existence_optional=existence_optional,
)
diff --git a/test/test_salmon.py b/test/test_salmon.py
index ca9ed71..16e1bfb 100644
--- a/test/test_salmon.py
+++ b/test/test_salmon.py
@@ -18,7 +18,7 @@ def test_salmon(
"""Test importing a salmon quantification file.
Args:
- salmon_file (Path): [description]
+ salmon_file (Path): Path to the salmon quantification file.
"""
for counts_from_abundance in [None, "scaled_tpm", "length_scaled_tpm"]:
for output_type in ["xarray", "anndata"]:
@@ -51,7 +51,7 @@ def test_multiple_salmon(
"""Test importing salmon quantification files.
Args:
- salmon_multiple_files (Path): [description]
+ salmon_multiple_files (List[Path]): List of paths to the salmon quantification files.
"""
for counts_from_abundance in [None, "scaled_tpm", "length_scaled_tpm"]:
for biotype_filter in [None, biotype_filters.GENCODE_PROTEIN_CODING]:
@@ -69,6 +69,7 @@ def test_multiple_salmon(
transcript_gene_mapping_mouse,
ignore_transcript_version=True,
ignore_after_bar=True,
+ output_type="xarray",
counts_from_abundance=counts_from_abundance, # type: ignore
biotype_filter=biotype_filter,
existence_optional=existence_optional,