Skip to content

Commit

Permalink
Reduce EnsemblDB public interface (#49)
Browse files: browse the repository at this point in the history
  • Loading branch information
ivirshup authored May 17, 2023
1 parent 2398977 commit 5bcc2a4
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 29 deletions.
38 changes: 19 additions & 19 deletions src/genomic_features/ensembl/ensembldb.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def genes(
if "gene_id" not in cols: # genes always needs gene_id
cols.append("gene_id")

query = self.build_query(table, cols, filter, join_type)
return self.execute_query(query)
query = self._build_query(table, cols, filter, join_type)
return self._execute_query(query)

def transcripts(
self,
Expand Down Expand Up @@ -183,8 +183,8 @@ def transcripts(
if ("tx_seq_start" in cols or "tx_seq_end" in cols) and "seq_name" not in cols:
cols.append("seq_name")

query = self.build_query(table, cols, filter, join_type)
return self.execute_query(query)
query = self._build_query(table, cols, filter, join_type)
return self._execute_query(query)

def exons(
self,
Expand Down Expand Up @@ -218,10 +218,10 @@ def exons(
) and "seq_name" not in cols:
cols.append("seq_name")

query = self.build_query(table, cols, filter, join_type)
return self.execute_query(query)
query = self._build_query(table, cols, filter, join_type)
return self._execute_query(query)

def execute_query(self, query: IbisTable) -> DataFrame:
def _execute_query(self, query: IbisTable) -> DataFrame:
"""Run a query and return the results."""
# TODO: Allow more options for returning results
return query.distinct().execute()
Expand All @@ -230,7 +230,7 @@ def chromosomes(self) -> DataFrame:
"""Get chromosome information."""
return self.db.table("chromosome").execute()

def build_query(
def _build_query(
self,
table: Literal["gene", "tx", "exon"],
cols: list[str],
Expand All @@ -239,27 +239,27 @@ def build_query(
) -> IbisTable:
"""Build a query for the genomic features table."""
# Finalize cols
self.clean_columns(cols)
self._clean_columns(cols)
for col in filter.columns():
if col not in cols:
cols.append(col)

# check if join is required
tables = self.get_required_tables(self.tables_for_columns(cols))
tables = self._get_required_tables(self._tables_for_columns(cols))

# Basically just to make sure exons stay in the query
if table not in tables:
tables.append(table)

if len(tables) > 1:
query = self.join_query(tables, start_with=table, join_type=join_type)
query = self._join_query(tables, start_with=table, join_type=join_type)
else:
query = self.db.table(table)
# add filter
query = query.filter(filter.convert()).select(cols)
return query

def join_query(
def _join_query(
self,
tables: list[str],
start_with: str,
Expand Down Expand Up @@ -318,7 +318,7 @@ def list_tables(self) -> list:
"""List all tables available in the genomic features database."""
return self.db.list_tables()

def tables_by_degree(self, tab: list[str] = None) -> list:
def _tables_by_degree(self, tab: list[str] = None) -> list:
"""Order tables available in the genomic features database."""
if tab is None:
tab = self.list_tables() # list of table names
Expand Down Expand Up @@ -350,7 +350,7 @@ def tables_by_degree(self, tab: list[str] = None) -> list:

return sorted(tab, key=lambda x: table_order[x])

def get_required_tables(self, tab) -> list:
def _get_required_tables(self, tab) -> list:
"""Given tables, get all intermediate tables required to execute the query."""
# If we have exon and any other table, we definitely need tx2exon
if "exon" in tab and len(tab) > 1:
Expand Down Expand Up @@ -381,7 +381,7 @@ def get_required_tables(self, tab) -> list:
if "entrezgene" in tab and len(tab) > 1:
tab = list(set(tab + ["gene"]))

return self.tables_by_degree(tab)
return self._tables_by_degree(tab)

def list_columns(self, tables: str | list[str] | None = None) -> list[str]:
"""List all columns available in the genomic features table."""
Expand All @@ -392,7 +392,7 @@ def list_columns(self, tables: str | list[str] | None = None) -> list[str]:
columns = [c for t in tables for c in self.db.table(t).columns]
return columns

def clean_columns(self, columns: list[str]) -> list[str]:
def _clean_columns(self, columns: list[str]) -> list[str]:
"""Clean a list of columns to make sure they are valid."""
if isinstance(columns, str):
columns = [columns]
Expand All @@ -408,7 +408,7 @@ def clean_columns(self, columns: list[str]) -> list[str]:
raise ValueError("No valid columns were found.")
return cols

def tables_for_columns(self, cols: list, start_with: str | None = None) -> list:
def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list:
"""
Return a list of tables that contain the specified columns.
Expand All @@ -417,8 +417,8 @@ def tables_for_columns(self, cols: list, start_with: str | None = None) -> list:
cols
Columns that we're looking for.
"""
cols = self.clean_columns(cols)
table_list = self.tables_by_degree() # list of table names
cols = self._clean_columns(cols)
table_list = self._tables_by_degree() # list of table names

# remove start_with from table_list and add it to the beginning of the list
if start_with is not None:
Expand Down
20 changes: 10 additions & 10 deletions tests/test_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def hsapiens108():


def test_tables_by_degree(hsapiens108):
result = hsapiens108.tables_by_degree()
result = hsapiens108._tables_by_degree()
assert result == [
"gene",
"tx",
Expand All @@ -23,9 +23,9 @@ def test_tables_by_degree(hsapiens108):
"entrezgene",
"metadata",
]
result = hsapiens108.tables_by_degree(tab=["protein", "exon"])
result = hsapiens108._tables_by_degree(tab=["protein", "exon"])
assert result == ["exon", "protein"]
result = hsapiens108.tables_by_degree(tab=["protein", "invalid_table"])
result = hsapiens108._tables_by_degree(tab=["protein", "invalid_table"])
assert result == ["protein"]


Expand All @@ -35,26 +35,26 @@ def test_list_columns(hsapiens108):


def test_clean_columns(hsapiens108):
result = hsapiens108.clean_columns("gene_id")
result = hsapiens108._clean_columns("gene_id")
assert result == ["gene_id"]
result = hsapiens108.clean_columns(["gene_id", "gene_name"])
result = hsapiens108._clean_columns(["gene_id", "gene_name"])
assert result == ["gene_id", "gene_name"]
with pytest.raises(ValueError):
hsapiens108.clean_columns(["gene_id", "invalid_column"])
hsapiens108._clean_columns(["gene_id", "invalid_column"])
with pytest.raises(ValueError):
hsapiens108.clean_columns([])
hsapiens108._clean_columns([])


def test_tables_for_columns(hsapiens108):
result = hsapiens108.tables_for_columns(["gene_id"])
result = hsapiens108._tables_for_columns(["gene_id"])
assert result == ["gene"]


def test_required_tables(hsapiens108):
result = hsapiens108.get_required_tables(["gene", "tx"])
result = hsapiens108._get_required_tables(["gene", "tx"])
assert result == ["gene", "tx"]
# case where we need intermediate tables
result = hsapiens108.get_required_tables(["gene", "protein"])
result = hsapiens108._get_required_tables(["gene", "protein"])
assert result == ["gene", "tx", "protein"]


Expand Down

0 comments on commit 5bcc2a4

Please sign in to comment.