Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mbarba/logging gff #204

Merged
merged 7 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pipelines/nextflow/modules/gff3/process_gff3.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ process PROCESS_GFF3 {
out_gff = "gene_models.gff3"
'''
gff3_process --genome_data !{genome} --in_gff_path !{gff3} --out_gff_path !{out_gff} \
--out_func_path !{out_func}
--out_func_path !{out_func} -v \
--log_file gff3_process.log
'''
}
66 changes: 37 additions & 29 deletions src/python/ensembl/io/genomio/gff3/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from collections import Counter
import json
import logging
from os import PathLike
from pathlib import Path
import re
Expand All @@ -35,6 +36,7 @@

from ensembl.io.genomio.gff3.extract_annotation import FunctionalAnnotations
from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from ensembl.utils.logging import init_logging
from ensembl.utils.logging import init_logging_with_args



class Records(list):
Expand Down Expand Up @@ -193,11 +195,9 @@ def _merge_genes(self, to_merge: List) -> str:
to_merge: List of gff3 lines with gene parts.
"""
print(f"Merge gene in {len(to_merge)} parts")
min_start = -1
max_end = -1
for gene in to_merge:
print(f"Merge part: {gene[8]}")
start = int(gene[3])
end = int(gene[4])

Expand Down Expand Up @@ -264,7 +264,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None:
for record in GFF.parse(in_gff_fh):
new_record = SeqRecord(record.seq, id=record.id)
if record.id in to_exclude:
print(f"Skip seq_region {record.id}")
logging.debug(f"Skip seq_region {record.id}")
continue

# Root features (usually genes)
Expand All @@ -286,8 +286,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None:
pass
else:
fail_types["gene=" + feat.type] = 1
message = f"Unsupported feature type: {feat.type} (for {feat.id})"
print(message)
logging.debug(f"Unsupported feature type: {feat.type} (for {feat.id})")
if skip_unrecognized:
del feat
continue
Expand Down Expand Up @@ -320,15 +319,17 @@ def format_mobile_element(self, feat: SeqFeature) -> SeqFeature:
if not feat.qualifiers.get("product"):
feat.qualifiers["product"] = [description]
else:
print(f"Mobile genetic element 'mobile_element_type' is not transposon: {element_type}")
logging.warning(
f"Mobile genetic element 'mobile_element_type' is not transposon: {element_type}"
)
return feat
else:
print("Mobile genetic element does not have a 'mobile_element_type' tag")
logging.warning("Mobile genetic element does not have a 'mobile_element_type' tag")
return feat
elif feat.type == "transposable_element":
pass
else:
print(f"Feature {feat.id} is not a supported TE feature {feat.type}")
logging.warning(f"Feature {feat.id} is not a supported TE feature {feat.type}")
return feat

# Generate ID if needed and add it to the functional annotation
Expand Down Expand Up @@ -356,7 +357,9 @@ def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature:
elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE):
biotype = f"TR_{biotype}"
else:
print(f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}")
logging.warning(
f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}"
)
return transcript
transcript.type = biotype
return transcript
Expand All @@ -376,7 +379,7 @@ def normalize_gene(self, gene: SeqFeature, fail_types: Dict[str, int]) -> SeqFea

# Gene with no subfeatures: need to create a transcript at least
if len(gene.sub_features) == 0:
print(f"Insert transcript for lone gene {gene.id}")
logging.debug(f"Insert transcript for lone gene {gene.id}")
transcript = self.transcript_for_gene(gene)
gene.sub_features = [transcript]

Expand All @@ -387,14 +390,14 @@ def normalize_gene(self, gene: SeqFeature, fail_types: Dict[str, int]) -> SeqFea
if len(fcounter) == 1:
if fcounter.get("CDS"):
num_subs = len(gene.sub_features)
print(f"Insert transcript-exon feats for {gene.id} ({num_subs} CDSs)")
logging.debug(f"Insert transcript-exon feats for {gene.id} ({num_subs} CDSs)")
transcripts = self.gene_to_cds(gene)
gene.sub_features = transcripts

# Transform gene - exon to gene-transcript-exon
elif fcounter.get("exon"):
num_subs = len(gene.sub_features)
print(f"Insert transcript for {gene.id} ({num_subs} exons)")
logging.debug(f"Insert transcript for {gene.id} ({num_subs} exons)")
transcript = self.gene_to_exon(gene)
gene.sub_features = [transcript]
else:
Expand Down Expand Up @@ -441,7 +444,7 @@ def _normalize_transcripts(self, gene: SeqFeature, fail_types) -> SeqFeature:
message = (
f"Unrecognized transcript type: {transcript.type}" f" for {transcript.id} ({gene.id})"
)
print(message)
logging.warning(message)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
logging.warning(message)
logging.warning(
f"Unrecognized transcript type: {transcript.type}" f" for {transcript.id} ({gene.id})"
)

And remove the previous message declaration?

if skip_unrecognized:
transcripts_to_delete.append(count)
continue
Expand Down Expand Up @@ -515,7 +518,7 @@ def _normalize_transcript_subfeatures(
f"Unrecognized exon type for {feat.type}: {feat.id}"
f" (for transcript {transcript.id} of type {transcript.type})"
)
print(message)
logging.warning(message)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same suggestion as before:

Suggested change
logging.warning(message)
logging.warning((
f"Unrecognized exon type for {feat.type}: {feat.id}"
f" (for transcript {transcript.id} of type {transcript.type})"
))

if self.skip_unrecognized:
exons_to_delete.append(tcount)
continue
Expand All @@ -538,7 +541,7 @@ def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature:
new_type = "ncRNA_gene"
if ncrna.type in ("tRNA", "rRNA"):
new_type = "gene"
print(f"Put the transcript {ncrna.type} in a {new_type} parent feature")
logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature")
gene = SeqFeature(ncrna.location, type=new_type)
gene.qualifiers["source"] = ncrna.qualifiers["source"]
gene.sub_features = [ncrna]
Expand All @@ -549,7 +552,7 @@ def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature:
def cds_gene(self, cds: SeqFeature) -> SeqFeature:
"""Returns a gene created for a lone CDS."""

print("Put the lone CDS in gene-mRNA parent features")
logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}")

# Create a transcript, add the CDS
transcript = SeqFeature(cds.location, type="mRNA")
Expand Down Expand Up @@ -603,7 +606,7 @@ def gene_to_cds(self, gene: SeqFeature) -> List[SeqFeature]:

# Add to transcript or create a new one
if cds.id not in transcripts_dict:
print(f"Create new mRNA for {cds.id}")
logging.debug(f"Create new mRNA for {cds.id}")
transcript = self.build_transcript(gene)
transcripts_dict[cds.id] = transcript
exon.qualifiers["source"] = gene.qualifiers["source"]
Expand Down Expand Up @@ -678,7 +681,7 @@ def move_cds_to_mrna(self, gene: SeqFeature) -> SeqFeature:
self._check_sub_cdss(gene, sub_cdss)
self._check_sub_exons(gene, cdss, sub_exons)

print(f"Gene {gene.id}: move {len(cdss)} CDSs to the mRNA")
logging.debug(f"Gene {gene.id}: move {len(cdss)} CDSs to the mRNA")
# No more issues? move the CDSs
mrna.sub_features += cdss
# And remove them from the gene
Expand Down Expand Up @@ -736,7 +739,7 @@ def clean_extra_exons(self, gene: SeqFeature) -> SeqFeature:
exon_has_id += 1
if exon_has_id:
if exon_has_id == len(exons):
print(f"Remove {exon_has_id} extra exons from {gene.id}")
logging.debug(f"Remove {exon_has_id} extra exons from {gene.id}")
gene.sub_features = mrnas
gene.sub_features += others
else:
Expand Down Expand Up @@ -774,20 +777,20 @@ def normalize_gene_id(self, gene: SeqFeature) -> str:

# In case the gene id is not valid, use the GeneID
if not self.valid_id(new_gene_id):
print(f"Gene id is not valid: {new_gene_id}")
logging.warning(f"Gene id is not valid: {new_gene_id}")
qual = gene.qualifiers
if "Dbxref" in qual:
for xref in qual["Dbxref"]:
(db, value) = xref.split(":")
if db == "GeneID":
new_gene_id = f"{db}_{value}"
print(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
logging.debug(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
return new_gene_id

# Make a new stable_id
if self.make_missing_stable_ids:
new_id = self.generate_stable_id()
print(f"New id: {new_gene_id} -> {new_id}")
logging.debug(f"New id: {new_gene_id} -> {new_id}")
return new_id
raise GFFParserError(f"Can't use invalid gene id for {gene}")

Expand Down Expand Up @@ -828,22 +831,22 @@ def valid_id(self, name: str) -> bool:

# Trna (from tRNAscan)
if re.search(r"^Trna", name):
print(f"Stable id is a Trna from tRNA-scan: {name}")
logging.debug(f"Stable ID is a tRNA from tRNA-scan: {name}")
return False

# Coordinates
if re.search(r"^.+:\d+..\d+", name):
print(f"Stable id is a coordinate: {name}")
logging.debug(f"Stable id is a coordinate: {name}")
return False

# Special characters
if re.search(r"[ |]", name):
print(f"Stable id contains special characters: {name}")
logging.debug(f"Stable id contains special characters: {name}")
return False

# Min length
if len(name) < min_length:
print(f"Stable id is too short (<{min_length}) {name}")
logging.debug(f"Stable id is too short (<{min_length}) {name}")
return False

return True
Expand Down Expand Up @@ -893,12 +896,12 @@ def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None:
gene_subfeats = []
for transcript in gene.sub_features:
if transcript.type == "CDS":
print(f"Remove pseudo CDS {transcript.id}")
logging.debug(f"Remove pseudo CDS {transcript.id}")
continue
new_subfeats = []
for feat in transcript.sub_features:
if feat.type == "CDS":
print(f"Remove pseudo CDS {feat.id}")
logging.debug(f"Remove pseudo CDS {feat.id}")
continue
new_subfeats.append(feat)
transcript.sub_features = new_subfeats
Expand Down Expand Up @@ -935,9 +938,12 @@ def main() -> None:
default=Path("functional_annotation.json"),
help="Output functional annotation JSON file",
)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()
init_logging(args.log_level, args.log_file, args.log_file_level)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
init_logging(args.log_level, args.log_file, args.log_file_level)
init_logging_with_args(args)


# Merge multiline gene features in a separate file
logging.info("Checking for genes to merge...")
interim_gff_path = Path(f"{args.in_gff_path}_INTERIM_MERGE")
merger = GFFGeneMerger()
merged_genes = merger.merge(args.in_gff_path, interim_gff_path)
Expand All @@ -946,12 +952,14 @@ def main() -> None:
# If there are split genes, decide to merge, or just die
if num_merged_genes > 0:
# Report the list of merged genes in case something does not look right
print(f"{num_merged_genes} genes merged:\n" + "\n".join(merged_genes))
logging.info(f"{num_merged_genes} genes merged")
logging.debug("\n".join(merged_genes))
# Use the GFF with the merged genes for the next part
in_gff_path = interim_gff_path

# Load GFF3 data and write a simpler version that follows our specifications as well as a
# functional annotation JSON file
logging.info("Simplify and fix GFF3")
gff_data = GFFSimplifier(args.genome_data, args.make_missing_stable_ids)
gff_data.simpler_gff3(in_gff_path)
gff_data.records.to_gff(args.out_gff_path)
Expand Down