Skip to content

Commit

Permalink
Feat: Use duckdb as backend instead of sqlite (#59)
Browse files: browse the repository at this point in the history
* use duckdb backend

* add duckdb dep

* fix: sort and reindex test results

* add: backend option (sqlite default)

* tests

* default to duckdb

* default to duckdb

* sqlite default
  • Loading branch information
thomas-reimonn authored Apr 3, 2024
1 parent ef33d87 commit 1c2a953
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 13 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ urls.Documentation = "https://genomic-features.readthedocs.io/"
urls.Source = "https://github.com/scverse/genomic-features"
urls.Home-page = "https://github.com/scverse/genomic-features"
dependencies = [
"ibis-framework[sqlite]>0.6",
"ibis-framework[sqlite, duckdb]>0.6",
"pooch",
"pandas",
"pyarrow",
Expand Down
29 changes: 21 additions & 8 deletions src/genomic_features/ensembl/ensembldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
TIMESTAMP_URL = "https://annotationhub.bioconductor.org/metadata/database_timestamp"


def annotation(species: str, version: str | int) -> EnsemblDB:
def annotation(
species: str, version: str | int, backend: Literal["duckdb", "sqlite"] = "sqlite"
) -> EnsemblDB:
"""Get an annotation database for a species and version.
Parameters
Expand All @@ -40,6 +42,8 @@ def annotation(species: str, version: str | int) -> EnsemblDB:
The species name. E.g. Hsapiens for human, Mmusculus for mouse.
version
The ensembl release number.
backend
The backend to use for the database. Either "sqlite" or "duckdb". Defaults to "sqlite".
Returns
-------
Expand All @@ -51,13 +55,22 @@ def annotation(species: str, version: str | int) -> EnsemblDB:
>>> gf.ensembl.annotation("Hsapiens", "108")
"""
try:
ensdb = EnsemblDB(
ibis.sqlite.connect(
retrieve_annotation(
ENSEMBL_URL_TEMPLATE.format(species=species, version=version)
)
)
sqlite_file_path = retrieve_annotation(
ENSEMBL_URL_TEMPLATE.format(species=species, version=version)
)

if backend == "sqlite":
# Connect to SQLite database
conn = ibis.sqlite.connect(sqlite_file_path)
ensdb = EnsemblDB(conn)
elif backend == "duckdb":
# Connect to DuckDB through Ibis
conn = ibis.duckdb.connect(":memory:", extensions=["sqlite"])
conn.attach_sqlite(sqlite_file_path)
ensdb = EnsemblDB(conn)
else:
raise ValueError(f"Invalid backend: {backend}")

except HTTPError as err:
if err.response.status_code == 404:
raise ValueError(
Expand Down Expand Up @@ -295,7 +308,7 @@ def _build_query(
else:
query = self.db.table(table)
# add filter
query = query.filter(filter.convert()).select(cols)
query = query.filter(filter.convert()).select(cols).order_by(cols)
return query

def _join_query(
Expand Down
20 changes: 16 additions & 4 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,14 +194,26 @@ def test_negation(hsapiens108):
assert result.shape[0] == 22894


def test_seqs_as_int(hsapiens108):
@pytest.mark.parametrize("backend", ["sqlite", "duckdb"])
def test_seqs_as_int(backend):
hsapiens108 = gf.ensembl.annotation("Hsapiens", 108, backend=backend)

result_w_int = hsapiens108.genes(filter=filters.SeqNameFilter(1))
result_w_str = hsapiens108.genes(filter=filters.SeqNameFilter("1"))
pd.testing.assert_frame_equal(result_w_int, result_w_str)
pd.testing.assert_frame_equal(
result_w_int,
result_w_str,
)

result_w_ints = hsapiens108.genes(filter=filters.SeqNameFilter([1, 2]))
result_w_strs = hsapiens108.genes(filter=filters.SeqNameFilter(["1", "2"]))
result_w_mixed = hsapiens108.genes(filter=filters.SeqNameFilter([1, "2"]))

pd.testing.assert_frame_equal(result_w_ints, result_w_strs)
pd.testing.assert_frame_equal(result_w_ints, result_w_mixed)
pd.testing.assert_frame_equal(
result_w_ints,
result_w_strs,
)
pd.testing.assert_frame_equal(
result_w_ints,
result_w_mixed,
)

0 comments on commit 1c2a953

Please sign in to comment.