From f8c1daae77f7b29169c54c1ddc8322c33eb3bda7 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Wed, 3 Apr 2024 13:45:40 -0400 Subject: [PATCH 1/7] order by chrom, start, end --- src/genomic_features/ensembl/ensembldb.py | 46 ++++++++++++++++++++--- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index 5d96d29..7e32d64 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -1,12 +1,15 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from functools import cached_property from itertools import product from pathlib import Path from typing import Final, Literal import ibis +import ibis.expr.types as ir +import ibis.selectors as s import requests from ibis import deferred from ibis.expr.types import Table as IbisTable @@ -160,6 +163,7 @@ def genes( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", + order_by: Sequence[str] | str = ("seq_name", "gene_seq_start", "gene_seq_end"), ) -> DataFrame: """Get gene annotations. @@ -172,6 +176,8 @@ def genes( Filters to apply to the query. join_type How to perform joins during the query (if cols or filters requires them). + order_by + Columns to order the results by. Usage @@ -183,11 +189,14 @@ def genes( # TODO: check why R adds entrezid cols = self.list_columns(table) # get all columns + if not isinstance(order_by, Sequence): + order_by = [order_by] + cols = cols.copy() if "gene_id" not in cols: # genes always needs gene_id cols.append("gene_id") - query = self._build_query(table, cols, filter, join_type) + query = self._build_query(table, cols, filter, join_type, order_by) return self._execute_query(query) def transcripts( @@ -195,6 +204,7 @@ def transcripts( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", + order_by: Sequence[str] | str = ("seq_name", "tx_seq_start", "tx_seq_end"), ) -> DataFrame: """Get transcript annotations. @@ -203,10 +213,12 @@ def transcripts( cols Which columns to retrieve from the database. Can be from other tables. Returns all transcript columns if None. - filters + filter Filters to apply to the query. join_type How to perform joins during the query (if cols or filters requires them). + order_by + Columns to order the results by. Usage @@ -218,6 +230,10 @@ def transcripts( cols = self.list_columns(table) # get all columns cols = cols.copy() + + if not isinstance(order_by, Sequence): + order_by = [order_by] + # Require primary key in output if "tx_id" not in cols: cols.append("tx_id") @@ -225,7 +241,7 @@ def transcripts( if ("tx_seq_start" in cols or "tx_seq_end" in cols) and "seq_name" not in cols: cols.append("seq_name") - query = self._build_query(table, cols, filter, join_type) + query = self._build_query(table, cols, filter, join_type, order_by) return self._execute_query(query) def exons( @@ -233,6 +249,12 @@ def exons( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", + order_by: Sequence[str] | str = ( + "seq_name", + "exon_seq_start", + "exon_seq_end", + "exon_id", + ), ) -> DataFrame: """Get exons table. @@ -265,7 +287,7 @@ def exons( ) and "seq_name" not in cols: cols.append("seq_name") - query = self._build_query(table, cols, filter, join_type) + query = self._build_query(table, cols, filter, join_type, order_by) return self._execute_query(query) def _execute_query(self, query: IbisTable) -> DataFrame: @@ -288,6 +310,13 @@ def _build_query( cols: list[str], filter: AbstractFilterExpr, join_type: Literal["inner", "left"] = "inner", + order_by: str + | ir.Column + | s.Selector + | Sequence[str] + | Sequence[ir.Column] + | Sequence[s.Selector] + | None = None, ) -> IbisTable: """Build a query for the genomic features table.""" # Finalize cols @@ -307,8 +336,15 @@ def _build_query( query = self._join_query(tables, start_with=table, join_type=join_type) else: query = self.db.table(table) + # add filter - query = query.filter(filter.convert()).select(cols).order_by(cols) + query = query.filter(filter.convert()) + + # add order_by + if order_by is not None: + query = query.order_by(order_by) + + query = query.select(cols) return query def _join_query( From abcdfd31f7ad4e45a154cc155c376d1c7406e4f3 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Wed, 3 Apr 2024 15:17:23 -0400 Subject: [PATCH 2/7] resolve sort issues --- src/genomic_features/ensembl/ensembldb.py | 14 +++++++------- tests/test_basic.py | 10 ++++++---- tests/test_filters.py | 5 +---- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index 7e32d64..58ac1ab 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -163,7 +163,12 @@ def genes( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", - order_by: Sequence[str] | str = ("seq_name", "gene_seq_start", "gene_seq_end"), + order_by: Sequence[str] | str = ( + "seq_name", + "gene_seq_start", + "gene_seq_end", + "gene_id", + ), ) -> DataFrame: """Get gene annotations. @@ -249,12 +254,7 @@ def exons( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", - order_by: Sequence[str] | str = ( - "seq_name", - "exon_seq_start", - "exon_seq_end", - "exon_id", - ), + order_by: Sequence[str] | str = ("exon_id"), ) -> DataFrame: """Get exons table. diff --git a/tests/test_basic.py b/tests/test_basic.py index 3e90722..2611f9a 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -8,8 +8,9 @@ def test_package_has_version(): assert gf.__version__ is not None -def test_genes(): - genes = gf.ensembl.annotation("Hsapiens", 108).genes() +@pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) +def test_genes(backend): + genes = gf.ensembl.annotation("Hsapiens", 108, backend=backend).genes() assert isinstance(genes, pd.DataFrame) @@ -30,8 +31,9 @@ def test_invalid_join(): gf.ensembl.annotation("Hsapiens", 108).genes(cols=["tx_id"], join_type="flarb") -def test_exons(): - ensdb = gf.ensembl.annotation("Hsapiens", 108) +@pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) +def test_exons(backend): + ensdb = gf.ensembl.annotation("Hsapiens", 108, backend=backend) exons = ensdb.exons() pd.testing.assert_index_equal( diff --git a/tests/test_filters.py b/tests/test_filters.py index 1daff6b..2b09a5a 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -194,10 +194,7 @@ def test_negation(hsapiens108): assert result.shape[0] == 22894 -@pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) -def test_seqs_as_int(backend): - hsapiens108 = gf.ensembl.annotation("Hsapiens", 108, backend=backend) - +def test_seqs_as_int(hsapiens108): result_w_int = hsapiens108.genes(filter=filters.SeqNameFilter(1)) result_w_str = hsapiens108.genes(filter=filters.SeqNameFilter("1")) pd.testing.assert_frame_equal( From 2c7018ea5f418aaf8979eccaa58f047cc771e193 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Thu, 4 Apr 2024 10:02:01 -0400 Subject: [PATCH 3/7] Sort in pre-determined order --- src/genomic_features/ensembl/ensembldb.py | 35 ++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index 58ac1ab..6961bd0 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -163,12 +163,7 @@ def genes( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", - order_by: Sequence[str] | str = ( - "seq_name", - "gene_seq_start", - "gene_seq_end", - "gene_id", - ), + order_by: Sequence[str] | str | None = None, ) -> DataFrame: """Get gene annotations. @@ -194,9 +189,6 @@ def genes( # TODO: check why R adds entrezid cols = self.list_columns(table) # get all columns - if not isinstance(order_by, Sequence): - order_by = [order_by] - cols = cols.copy() if "gene_id" not in cols: # genes always needs gene_id cols.append("gene_id") @@ -209,7 +201,7 @@ def transcripts( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", - order_by: Sequence[str] | str = ("seq_name", "tx_seq_start", "tx_seq_end"), + order_by: Sequence[str] | str | None = None, ) -> DataFrame: """Get transcript annotations. @@ -236,9 +228,6 @@ def transcripts( cols = cols.copy() - if not isinstance(order_by, Sequence): - order_by = [order_by] - # Require primary key in output if "tx_id" not in cols: cols.append("tx_id") @@ -254,7 +243,7 @@ def exons( cols: list[str] | None = None, filter: AbstractFilterExpr = filters.EmptyFilter(), join_type: Literal["inner", "left"] = "inner", - order_by: Sequence[str] | str = ("exon_id"), + order_by: Sequence[str] | str | None = None, ) -> DataFrame: """Get exons table. @@ -339,12 +328,26 @@ def _build_query( # add filter query = query.filter(filter.convert()) + query = query.select(cols) - # add order_by if order_by is not None: + # Custom ordering is provided + query = query.order_by(order_by) + else: + # Default ordering + order_by = [] + if "seq_name" in cols: + order_by = ["seq_name"] + if "gene_seq_start" in cols: + order_by.extend(["gene_seq_start"]) + if "tx_seq_start" in cols: + order_by.extend(["tx_seq_start"]) + if "exon_seq_start" in cols: + order_by.extend(["exon_seq_start"]) + + order_by.extend([c for c in cols if "id" in c]) query = query.order_by(order_by) - query = query.select(cols) return query def _join_query( From b37fea68d5f23330a76c0e31c3767fce68fd62c5 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Thu, 4 Apr 2024 10:02:08 -0400 Subject: [PATCH 4/7] Test sorting --- tests/test_basic.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_basic.py b/tests/test_basic.py index 2611f9a..9df844f 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -13,6 +13,12 @@ def test_genes(backend): genes = gf.ensembl.annotation("Hsapiens", 108, backend=backend).genes() assert isinstance(genes, pd.DataFrame) + # Test sort order + genes_resorted = genes.sort_values( + ["seq_name", "gene_seq_start", "gene_id"] + ).reset_index(drop=True) + pd.testing.assert_frame_equal(genes, genes_resorted) + def test_missing_version(): with pytest.raises(ValueError): @@ -46,3 +52,9 @@ def test_exons(backend): pd.testing.assert_index_equal(exons_id.columns, pd.Index(["exon_id"])) assert exons_id.shape[0] == exons.shape[0] + + # Test sort order + exons_resorted = exons.sort_values( + ["seq_name", "exon_seq_start", "exon_id"] + ).reset_index(drop=True) + pd.testing.assert_frame_equal(exons, exons_resorted) From a5cdb15cb03602a9801a6c6ad6b82cd1d222eea1 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Thu, 4 Apr 2024 10:34:20 -0400 Subject: [PATCH 5/7] Test join sort case --- tests/test_basic.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/test_basic.py b/tests/test_basic.py index 9df844f..57990a7 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -3,6 +3,8 @@ import genomic_features as gf +ENSEMBL_RELEASE = 108 + def test_package_has_version(): assert gf.__version__ is not None @@ -10,7 +12,7 @@ def test_package_has_version(): @pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) def test_genes(backend): - genes = gf.ensembl.annotation("Hsapiens", 108, backend=backend).genes() + genes = gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE, backend=backend).genes() assert isinstance(genes, pd.DataFrame) # Test sort order @@ -26,20 +28,24 @@ def test_missing_version(): def test_repr(): - result = repr(gf.ensembl.annotation("Hsapiens", 108)) - expected = "EnsemblDB(organism='Homo sapiens', ensembl_release='108')" + result = repr(gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE)) + expected = ( + f"EnsemblDB(organism='Homo sapiens', ensembl_release='{ENSEMBL_RELEASE}')" + ) assert result == expected def test_invalid_join(): with pytest.raises(ValueError, match=r"Invalid join type: flarb"): - gf.ensembl.annotation("Hsapiens", 108).genes(cols=["tx_id"], join_type="flarb") + gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE).genes( + cols=["tx_id"], join_type="flarb" + ) @pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) def test_exons(backend): - ensdb = gf.ensembl.annotation("Hsapiens", 108, backend=backend) + ensdb = gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE, backend=backend) exons = ensdb.exons() pd.testing.assert_index_equal( @@ -58,3 +64,24 @@ def test_exons(backend): ["seq_name", "exon_seq_start", "exon_id"] ).reset_index(drop=True) pd.testing.assert_frame_equal(exons, exons_resorted) + + +@pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) +def test_join_sort_ordering(backend): + ensdb = gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE, backend=backend) + df = ensdb.genes( + [ + "seq_name", + "gene_seq_start", + "gene_seq_end", + "exon_id", + "exon_seq_start", + "exon_seq_end", + ] + ) + + # Test sort order + df_resorted = df.sort_values( + ["seq_name", "gene_seq_start", "exon_seq_start", "exon_id", "gene_id"] + ).reset_index(drop=True) + pd.testing.assert_frame_equal(df, df_resorted) From 5824f3a8f61729527eb269e72b5531e75d6ea7aa Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Thu, 4 Apr 2024 11:21:33 -0400 Subject: [PATCH 6/7] test coverage --- tests/test_basic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_basic.py b/tests/test_basic.py index 57990a7..9bf5a25 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -27,6 +27,11 @@ def test_missing_version(): gf.ensembl.annotation("Hsapiens", 86) +def test_invalid_backend(): + with pytest.raises(ValueError): + gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE, backend="bad_idea") + + def test_repr(): result = repr(gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE)) expected = ( From 2cacae99cd55972b27ef3212e8d85f64cc7150c5 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Thu, 4 Apr 2024 11:28:42 -0400 Subject: [PATCH 7/7] Test custom ordering --- tests/test_basic.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_basic.py b/tests/test_basic.py index 9bf5a25..3a37a1c 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -90,3 +90,23 @@ def test_join_sort_ordering(backend): ["seq_name", "gene_seq_start", "exon_seq_start", "exon_id", "gene_id"] ).reset_index(drop=True) pd.testing.assert_frame_equal(df, df_resorted) + + +@pytest.mark.parametrize("backend", ["sqlite", "duckdb"]) +def test_custom_ordering(backend): + ensdb = gf.ensembl.annotation("Hsapiens", ENSEMBL_RELEASE, backend=backend) + df = ensdb.genes( + [ + "seq_name", + "gene_seq_start", + "gene_seq_end", + "exon_id", + "exon_seq_start", + "exon_seq_end", + ], + order_by="exon_id", + ) + + # Test sort order + df_resorted = df.sort_values(["exon_id"]).reset_index(drop=True) + pd.testing.assert_frame_equal(df, df_resorted)