Skip to content

Commit

Permalink
Merge pull request #214 from Ensembl/lcampbell/genome_stats_compare
Browse files Browse the repository at this point in the history
Implement ensembl-py logging
  • Loading branch information
JAlvarezJarreta authored Nov 21, 2023
2 parents 0ae4c26 + 22048b3 commit 6985344
Show file tree
Hide file tree
Showing 20 changed files with 113 additions and 40 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Documentation_deploy_mkdocs
run-name: ${{ github.actor }} triggered mkdocs generation
on:
Expand Down
2 changes: 2 additions & 0 deletions docs/gen_ref_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# [Ensembl GenomIO](https://github.com/Ensembl/ensembl-genomio)

*Ensembl-genomIO Base Library Documentation*
*Ensembl GenomIO Base Library Documentation*

A repository dedicated to pipelines used to turn basic genomic data into formatted
Ensembl core databases. It also allows users to dump core databases into various formats.
Expand Down
5 changes: 4 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
# limitations under the License.

site_name: Ensembl-GenomIO
site_url: "https://ensembl.github.io/ensembl-genomio"
repo_url: "https://github.com/Ensembl/ensembl-genomio"
repo_name: "Ensembl-GenomIO"
copyright: Copyright © [2016-2023] EMBL-European Bioinformatics Institute

theme:
name: "material"
logo: img/logo.png

plugins:
- search
Expand Down
2 changes: 1 addition & 1 deletion pipelines/nextflow/modules/download/download_asm_data.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,6 @@ process DOWNLOAD_ASM_DATA {

shell:
'''
assembly_download --accession !{meta.accession} --asm_download_dir ./
assembly_download --accession !{meta.accession} --download_dir ./ --verbose
'''
}
3 changes: 2 additions & 1 deletion pipelines/nextflow/modules/events/dump_events.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ process DUMP_EVENTS {
--user '${server.user}' \
--password '${server.password}' \
--database '${db.database}' \
--output_file "events.txt"
--output_file "events.txt" \
--verbose
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ process DUMP_GENOME_META {
--user '${server.user}' \
--password '${server.password}' \
--database '${db.database}' \
--verbose \
> genome.json
"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ process COMPARE_GENOME_STATS {

script:
"""
genome_stats_compare --ncbi $ncbi_stats --core $core_stats > diff_stats.json
genome_stats_compare --ncbi $ncbi_stats --core $core_stats --verbose > diff_stats.json
"""
}
2 changes: 1 addition & 1 deletion pipelines/nextflow/modules/manifest/check_integrity_db.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ process CHECK_INTEGRITY {
script:
brc_mode = params.brc_mode ? '--brc_mode' : ''
"""
manifest_check_integrity --manifest_file ${manifest_dir}/manifest.json $brc_mode
manifest_check_integrity --manifest_file ${manifest_dir}/manifest.json $brc_mode --verbose
"""
}
3 changes: 2 additions & 1 deletion pipelines/nextflow/modules/patch_build/load_events.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ process LOAD_EVENTS {
--password $server.password \\
--database $server.database \\
--input_file $events \\
--update
--update \\
--verbose
"""
}
5 changes: 2 additions & 3 deletions pipelines/nextflow/subworkflows/dump_metadata/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@
include { DUMP_SEQ_REGIONS } from '../../modules/seq_region/dump_seq_regions.nf'
include { DUMP_EVENTS } from '../../modules/events/dump_events.nf'
include { DUMP_GENOME_META } from '../../modules/genome_metadata/dump_genome_meta.nf'
include { DUMP_GENOME_STATS } from '../../modules/genome_metadata/dump_genome_stats.nf'
include { COMPARE_GENOME_STATS } from '../../modules/genome_metadata/compare_genome_stats.nf'
include { DUMP_GENOME_STATS } from '../../modules/genome_stats/dump_genome_stats.nf'
include { COMPARE_GENOME_STATS } from '../../modules/genome_stats/compare_genome_stats.nf'
include { DUMP_NCBI_STATS } from '../../modules/genome_metadata/dump_ncbi_stats.nf'
include { CHECK_JSON_SCHEMA } from '../../modules/schema/check_json_schema_db.nf'

include { COLLECT_FILES } from '../../modules/files/collect_files_db.nf'
include { MANIFEST } from '../../modules/files/collect_files_db.nf'
include { PUBLISH_DIR } from '../../modules/files/collect_files_db.nf'
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies = [
"bcbio-gff == 0.7.0",
"biopython == 1.81",
"ensembl-hive @ git+https://github.com/Ensembl/ensembl-hive.git",
"ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git", # minimum v1.2.0
"ensembl-py @ git+https://github.com/Ensembl/ensembl-py.git", # minimum v1.2.1
"jsonschema >= 4.6.0",
"importlib_resources", # not needed from Python 3.9+
"intervaltree >= 3.1.0",
Expand Down Expand Up @@ -83,6 +83,7 @@ doc = [
[project.urls]
homepage = "https://www.ensembl.org"
repository = "https://github.com/Ensembl/ensembl-genomio"
documentation = "https://ensembl.github.io/ensembl-genomio"

[project.scripts]
# Assembly
Expand Down
11 changes: 10 additions & 1 deletion src/python/ensembl/brc4/runnable/core_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import argparse
import re
from typing import Dict, List, Any
import logging

import mysql.connector

Expand Down Expand Up @@ -55,7 +56,11 @@ def connect(self) -> None:
)

def set_database(self, db_name: str) -> None:
self._connector.database = db_name
try:
self._connector.database = db_name
except mysql.connector.errors.ProgrammingError:
logging.exception(f"Unknown database ! DB:'{db_name}' not located on host {self.host}")
raise

def get_cursor(self):
return self._connector.cursor()
Expand Down Expand Up @@ -90,6 +95,10 @@ def get_cores(

dbs = self.get_all_cores()

# Check if there are databases returned from query to host
if not dbs:
logging.warning("No databases returned from query")

if prefix:
dbs = [db for db in dbs if db.startswith(f"{prefix}")]
if dbname_re:
Expand Down
8 changes: 6 additions & 2 deletions src/python/ensembl/io/genomio/assembly/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from typing import Dict

from ensembl.utils.argparse import ArgumentParser

from ensembl.utils.logging import init_logging

_FILE_ENDS = {
"assembly_report.txt": "report",
Expand Down Expand Up @@ -281,6 +281,10 @@ def main() -> None:
parser.add_argument_dst_path(
"--download_dir", default=Path.cwd(), help="Folder where the data will be downloaded"
)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()

retrieve_assembly_data(**vars(args))
# Configure and initialise logging
init_logging(args.log_level, args.log_file, args.log_file_level)

retrieve_assembly_data(args.accession, args.download_dir)
14 changes: 10 additions & 4 deletions src/python/ensembl/io/genomio/events/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Set, Tuple
import logging

from ensembl.brc4.runnable.core_server import CoreServer
from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging


BRC4_START_DATE = datetime(2020, 5, 1)
Expand Down Expand Up @@ -307,7 +309,7 @@ def get_history(self) -> List:

events = []
for session in sessions:
print(f"Mapping session {session['release']}")
logging.info(f"Mapping session {session['release']}")
pairs = self.get_pairs(session["id"])
session_events = self.make_events(pairs)
for event in session_events:
Expand All @@ -327,7 +329,7 @@ def print_events(self, events: List[StableIdEvent], output_file: Path) -> None:
"""
if not events:
print("No events to print")
logging.info("No events to print")
return
with output_file.open("w") as out_fh:
for event in events:
Expand Down Expand Up @@ -379,7 +381,7 @@ def get_pairs(self, session_id: int) -> List[Pair]:
for db in cursor:
pair = Pair(old_id=db[0], new_id=db[1])
pairs.append(pair)
print(f"{len(pairs)} stable id events")
logging.debug(f"{len(pairs)} stable id events")
return pairs

def make_events(self, pairs: List[Pair]) -> List:
Expand Down Expand Up @@ -423,7 +425,7 @@ def make_events(self, pairs: List[Pair]) -> List:
stats[name] += 1

for stat, value in stats.items():
print(f"\t{stat} = {value}")
logging.info(f"\t{stat} = {value}")

return events

Expand Down Expand Up @@ -525,8 +527,12 @@ def main() -> None:
)
parser.add_server_arguments(include_database=True)
parser.add_argument_dst_path("--output_file", required=True, help="Output file")
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()

# Configure and initialise logging
init_logging(args.log_level, args.log_file, args.log_file_level)

# Start
factory = CoreServer(host=args.host, port=args.port, user=args.user, password=args.password)
factory.set_database(args.database)
Expand Down
20 changes: 13 additions & 7 deletions src/python/ensembl/io/genomio/events/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@
from os import PathLike
from pathlib import Path
import re
import logging
from typing import Dict, Generator, List, Optional, Tuple

from sqlalchemy.orm import Session

from ensembl.database import DBConnection
from ensembl.core.models import MappingSession, StableIdEvent
from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging


@dataclass
Expand Down Expand Up @@ -64,7 +66,7 @@ def add_event(self, event: IdEvent) -> None:


class EventCollection:
""" "Collection of events with loader/writer in various formats."""
"""Collection of events with loader/writer in various formats."""

def __init__(self) -> None:
self.events: List[IdEvent] = []
Expand Down Expand Up @@ -143,7 +145,7 @@ def _parse_gene_diff_event(self, event_string: str) -> Generator[Tuple[str, str,
splitter = f"({event_sep})"
parts = re.split(splitter, event_string)
if len(parts) != 3:
print(f"Wrong partition: from '{event_string}' to '{parts}'")
logging.warning(f"Wrong partition: from '{event_string}' to '{parts}'")
return
[from_ids, sep, to_ids] = parts
event_name = event_symbol[sep]
Expand All @@ -165,7 +167,7 @@ def remap_to_ids(self, map_dict: Dict[str, str]):
elif event.to_id in map_dict:
event.to_id = map_dict[event.to_id]
else:
print(f"No map for to_id {event.to_id}")
logging.info(f"No map for to_id {event.to_id}")
no_map += 1

if no_map:
Expand All @@ -174,7 +176,7 @@ def remap_to_ids(self, map_dict: Dict[str, str]):
def write_events_to_file(self, output_file: PathLike) -> None:
"""Write the events to a file."""
with Path(output_file).open("w") as out_fh:
print(f"Write {len(self.events)} events to {output_file}")
logging.info(f"Write {len(self.events)} events to {output_file}")
for event in self.events:
out_fh.write(f"{event}\n")

Expand All @@ -194,7 +196,7 @@ def write_events_to_db(self, session: Session, update: bool = False) -> None:
# Then, add the mapping, and the events for this mapping
for release, mapping in mappings.items():
if update:
print(f"Adding mapping for release {release} ({len(mapping.events)} events)")
logging.info(f"Adding mapping for release {release} ({len(mapping.events)} events)")
map_session = MappingSession(new_release=mapping.release, created=mapping.release_date)
session.add(map_session)
session.flush()
Expand All @@ -217,9 +219,9 @@ def write_events_to_db(self, session: Session, update: bool = False) -> None:
session.add(id_event)
session.commit()
else:
print(f"Found mapping for release {release} ({len(mapping.events)} events)")
logging.info(f"Found mapping for release {release} ({len(mapping.events)} events)")
if not update:
print("Run your command again with '--update' to add them")
logging.info("Run your command again with '--update' to add them")


def main() -> None:
Expand All @@ -235,8 +237,12 @@ def main() -> None:
),
)
parser.add_argument("--update", action="store_true", help="Make changes to the database")
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()

# Configure and initialise logging
init_logging(args.log_level, args.log_file, args.log_file_level)

# Start
dbc = DBConnection(args.url)

Expand Down
10 changes: 10 additions & 0 deletions src/python/ensembl/io/genomio/genome_metadata/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@

import json
from typing import Any, Dict
import logging

from sqlalchemy import select
from sqlalchemy.orm import Session

from ensembl.core.models import Meta
from ensembl.database import DBConnection
from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging


def get_genome_metadata(session: Session) -> Dict[str, Any]:
Expand Down Expand Up @@ -130,13 +132,17 @@ def check_assembly_version(gmeta_out: Dict[str, Any]) -> None:
# Check the version is an integer
if version is not None and version.isdigit():
assembly["version"] = int(version)
logging.info(f"Located version [v{int(version)}] info from meta data.")
else:
# Get the version from the assembly accession
accession = assembly["accession"]
parts = accession.split(".")
if len(parts) == 2 and parts[1].isdigit():
version = parts[1]
assembly["version"] = int(version)
logging.info(
f'Asm version [v{version}] obtained from: assembly accession ({assembly["accession"]}).'
)
else:
raise ValueError(f"Assembly version is not an integer in {assembly}")

Expand All @@ -147,10 +153,14 @@ def main() -> None:
description="Fetch the genome metadata from a core database and print it in JSON format."
)
parser.add_server_arguments(include_database=True)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()

dbc = DBConnection(args.url)

# Configure and initialise logging
init_logging(args.log_level, args.log_file, args.log_file_level)

with dbc.session_scope() as session:
genome_meta = get_genome_metadata(session)
genome_meta = filter_genome_meta(genome_meta)
Expand Down
Loading

0 comments on commit 6985344

Please sign in to comment.