From 4c09c090314fb4080f695aa4dc4267b6a5e39a95 Mon Sep 17 00:00:00 2001 From: Emma Dann Date: Fri, 28 Apr 2023 12:28:46 +0000 Subject: [PATCH 1/3] added tests --- src/genomic_features/ensembl/ensembldb.py | 44 +++++++++++++++++++++++ tests/test_basic.py | 13 ++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index dfcdea8..8fc38e3 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -2,6 +2,7 @@ import ibis import requests +import numpy as np from ibis import _ from pandas import DataFrame, Timestamp from requests.exceptions import HTTPError @@ -127,3 +128,46 @@ def genes( def chromosomes(self): """Get chromosome information.""" return self.db.table("chromosome").execute() + + def promoters( + self, filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + upstream: int = 2000, downstream: int = 200, + canonical_transcripts: bool = False + ) -> DataFrame: + """Get promoter annotations. + + Parameters + ---------- + filter + Filter expression to apply to the genes table. + upstream + Number of base pairs upstream of the TSS (default: 2000). + downstream + Number of base pairs downstream of the TSS (default: 200). + canonical_transcripts + If True, return only canonical transcript for each gene (default: False). + + Returns + ------- + DataFrame + A table of promoter annotations. + """ + # TODO: change to get transcript table with gene level columns + # something like: + # + tx_table = self.genes(filter) + + # Get promoter region based on strand + # strand = 1 |>>>>>>>>>>>>>>| + # strand = -1 |<<<<<<<<<<<<<<| + # Tx SS: * * + # Promoter ------ ------ + tx_ss = np.where(tx_table['seq_strand'] == 1, tx_table['gene_seq_start'], tx_table['gene_seq_end']) + tx_table['promoter_seq_start'] = np.where(tx_table['seq_strand'] == 1, tx_ss - upstream, tx_ss - downstream) + tx_table['promoter_seq_end'] = np.where(tx_table['seq_strand'] == 1, tx_ss + downstream, tx_ss + upstream) + # if canonical_transcripts: + # tx_table = tx_table[tx_table["tx_is_canonical"] == 1] + return(tx_table) + + + diff --git a/tests/test_basic.py b/tests/test_basic.py index c4fcad8..eb3eb43 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -12,7 +12,18 @@ def test_genes(): genes = gf.ensembl.annotation("Hsapiens", 108).genes() assert isinstance(genes, pd.DataFrame) - def test_missing_version(): with pytest.raises(ValueError): gf.ensembl.annotation("Hsapiens", 86) + +def test_promoters(): + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters() + assert isinstance(promoters, pd.DataFrame) + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters(upstream=100, downstream=100) + assert ((promoters.promoter_seq_end - promoters.promoter_seq_start) == 200).all() + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters(upstream=1000, downstream=100) + assert ((promoters.promoter_seq_end - promoters.promoter_seq_start) == 1100).all() + # Test strandedness + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters(upstream=1000, downstream=100) + assert (promoters[promoters.seq_strand == -1].promoter_seq_start == promoters[promoters.seq_strand == -1].gene_seq_end - 100).all() + assert (promoters[promoters.seq_strand == 1].promoter_seq_start == promoters[promoters.seq_strand == 1].gene_seq_start - 1000).all() \ No newline at end of file From c795b4739fc042fc8e300681bf055792255d5351 Mon Sep 17 00:00:00 2001 From: Emma Dann Date: Fri, 28 Apr 2023 12:29:31 +0000 Subject: [PATCH 2/3] cleaning --- tests/test_basic.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/test_basic.py b/tests/test_basic.py index eb3eb43..2bf8b11 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -12,18 +12,32 @@ def test_genes(): genes = gf.ensembl.annotation("Hsapiens", 108).genes() assert isinstance(genes, pd.DataFrame) + def test_missing_version(): with pytest.raises(ValueError): gf.ensembl.annotation("Hsapiens", 86) + def test_promoters(): promoters = gf.ensembl.annotation("Hsapiens", 108).promoters() assert isinstance(promoters, pd.DataFrame) - promoters = gf.ensembl.annotation("Hsapiens", 108).promoters(upstream=100, downstream=100) + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters( + upstream=100, downstream=100 + ) assert ((promoters.promoter_seq_end - promoters.promoter_seq_start) == 200).all() - promoters = gf.ensembl.annotation("Hsapiens", 108).promoters(upstream=1000, downstream=100) + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters( + upstream=1000, downstream=100 + ) assert ((promoters.promoter_seq_end - promoters.promoter_seq_start) == 1100).all() # Test strandedness - promoters = gf.ensembl.annotation("Hsapiens", 108).promoters(upstream=1000, downstream=100) - assert (promoters[promoters.seq_strand == -1].promoter_seq_start == promoters[promoters.seq_strand == -1].gene_seq_end - 100).all() - assert (promoters[promoters.seq_strand == 1].promoter_seq_start == promoters[promoters.seq_strand == 1].gene_seq_start - 1000).all() \ No newline at end of file + promoters = gf.ensembl.annotation("Hsapiens", 108).promoters( + upstream=1000, downstream=100 + ) + assert ( + promoters[promoters.seq_strand == -1].promoter_seq_start + == promoters[promoters.seq_strand == -1].gene_seq_end - 100 + ).all() + assert ( + promoters[promoters.seq_strand == 1].promoter_seq_start + == promoters[promoters.seq_strand == 1].gene_seq_start - 1000 + ).all() From 36fd56513935a7ffb7c075ab752d846b53db3858 Mon Sep 17 00:00:00 2001 From: Emma Dann Date: Fri, 28 Apr 2023 12:43:01 +0000 Subject: [PATCH 3/3] added TODOs --- src/genomic_features/ensembl/ensembldb.py | 43 +++++++++++++---------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index 8fc38e3..aec0343 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -1,8 +1,8 @@ from __future__ import annotations import ibis -import requests import numpy as np +import requests from ibis import _ from pandas import DataFrame, Timestamp from requests.exceptions import HTTPError @@ -128,12 +128,14 @@ def genes( def chromosomes(self): """Get chromosome information.""" return self.db.table("chromosome").execute() - + def promoters( - self, filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), - upstream: int = 2000, downstream: int = 200, - canonical_transcripts: bool = False - ) -> DataFrame: + self, + filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + upstream: int = 2000, + downstream: int = 200, + canonical_transcripts: bool = False, + ) -> DataFrame: """Get promoter annotations. Parameters @@ -146,7 +148,7 @@ def promoters( Number of base pairs downstream of the TSS (default: 200). canonical_transcripts If True, return only canonical transcript for each gene (default: False). - + Returns ------- DataFrame @@ -154,20 +156,25 @@ def promoters( """ # TODO: change to get transcript table with gene level columns # something like: - # + # tx_table = self.transcripts(cols = set(cols + ['seq_strand', 'seq_name', 'tx_is_canonical']), filter = filter) tx_table = self.genes(filter) - + # Get promoter region based on strand - # strand = 1 |>>>>>>>>>>>>>>| - # strand = -1 |<<<<<<<<<<<<<<| + # strand = 1 |>>>>>>>>>>>>>>| + # strand = -1 |<<<<<<<<<<<<<<| # Tx SS: * * # Promoter ------ ------ - tx_ss = np.where(tx_table['seq_strand'] == 1, tx_table['gene_seq_start'], tx_table['gene_seq_end']) - tx_table['promoter_seq_start'] = np.where(tx_table['seq_strand'] == 1, tx_ss - upstream, tx_ss - downstream) - tx_table['promoter_seq_end'] = np.where(tx_table['seq_strand'] == 1, tx_ss + downstream, tx_ss + upstream) + tx_ss = np.where( + tx_table["seq_strand"] == 1, + tx_table["gene_seq_start"], + tx_table["gene_seq_end"], + ) + tx_table["promoter_seq_start"] = np.where( + tx_table["seq_strand"] == 1, tx_ss - upstream, tx_ss - downstream + ) + tx_table["promoter_seq_end"] = np.where( + tx_table["seq_strand"] == 1, tx_ss + downstream, tx_ss + upstream + ) # if canonical_transcripts: # tx_table = tx_table[tx_table["tx_is_canonical"] == 1] - return(tx_table) - - - + return tx_table