Skip to content

Commit

Permalink
Merge pull request #204 from Ensembl/mbarba/logging_gff
Browse files Browse the repository at this point in the history
Mbarba/logging gff
  • Loading branch information
MatBarba authored Nov 23, 2023
2 parents 18b356c + c616efd commit 8d91580
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 30 deletions.
3 changes: 2 additions & 1 deletion pipelines/nextflow/modules/gff3/process_gff3.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ process PROCESS_GFF3 {
out_gff = "gene_models.gff3"
'''
gff3_process --genome_data !{genome} --in_gff_path !{gff3} --out_gff_path !{out_gff} \
--out_func_path !{out_func}
--out_func_path !{out_func} -v \
--log_file gff3_process.log
'''
}
66 changes: 37 additions & 29 deletions src/python/ensembl/io/genomio/gff3/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from collections import Counter
import json
import logging
from os import PathLike
from pathlib import Path
import re
Expand All @@ -35,6 +36,7 @@

from ensembl.io.genomio.gff3.extract_annotation import FunctionalAnnotations
from ensembl.utils.argparse import ArgumentParser
from ensembl.utils.logging import init_logging


class Records(list):
Expand Down Expand Up @@ -193,11 +195,9 @@ def _merge_genes(self, to_merge: List) -> str:
to_merge: List of gff3 lines with gene parts.
"""
print(f"Merge gene in {len(to_merge)} parts")
min_start = -1
max_end = -1
for gene in to_merge:
print(f"Merge part: {gene[8]}")
start = int(gene[3])
end = int(gene[4])

Expand Down Expand Up @@ -264,7 +264,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None:
for record in GFF.parse(in_gff_fh):
new_record = SeqRecord(record.seq, id=record.id)
if record.id in to_exclude:
print(f"Skip seq_region {record.id}")
logging.debug(f"Skip seq_region {record.id}")
continue

# Root features (usually genes)
Expand All @@ -286,8 +286,7 @@ def simpler_gff3(self, in_gff_path: PathLike) -> None:
pass
else:
fail_types["gene=" + feat.type] = 1
message = f"Unsupported feature type: {feat.type} (for {feat.id})"
print(message)
logging.debug(f"Unsupported feature type: {feat.type} (for {feat.id})")
if skip_unrecognized:
del feat
continue
Expand Down Expand Up @@ -320,15 +319,17 @@ def format_mobile_element(self, feat: SeqFeature) -> SeqFeature:
if not feat.qualifiers.get("product"):
feat.qualifiers["product"] = [description]
else:
print(f"Mobile genetic element 'mobile_element_type' is not transposon: {element_type}")
logging.warning(
f"Mobile genetic element 'mobile_element_type' is not transposon: {element_type}"
)
return feat
else:
print("Mobile genetic element does not have a 'mobile_element_type' tag")
logging.warning("Mobile genetic element does not have a 'mobile_element_type' tag")
return feat
elif feat.type == "transposable_element":
pass
else:
print(f"Feature {feat.id} is not a supported TE feature {feat.type}")
logging.warning(f"Feature {feat.id} is not a supported TE feature {feat.type}")
return feat

# Generate ID if needed and add it to the functional annotation
Expand Down Expand Up @@ -356,7 +357,9 @@ def format_gene_segments(self, transcript: SeqFeature) -> SeqFeature:
elif re.search(r"\bt[- _]cell\b", standard_name, flags=re.IGNORECASE):
biotype = f"TR_{biotype}"
else:
print(f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}")
logging.warning(
f"Unexpected 'standard_name' content for feature {transcript.id}: {standard_name}"
)
return transcript
transcript.type = biotype
return transcript
Expand All @@ -376,7 +379,7 @@ def normalize_gene(self, gene: SeqFeature, fail_types: Dict[str, int]) -> SeqFea

# Gene with no subfeatures: need to create a transcript at least
if len(gene.sub_features) == 0:
print(f"Insert transcript for lone gene {gene.id}")
logging.debug(f"Insert transcript for lone gene {gene.id}")
transcript = self.transcript_for_gene(gene)
gene.sub_features = [transcript]

Expand All @@ -387,14 +390,14 @@ def normalize_gene(self, gene: SeqFeature, fail_types: Dict[str, int]) -> SeqFea
if len(fcounter) == 1:
if fcounter.get("CDS"):
num_subs = len(gene.sub_features)
print(f"Insert transcript-exon feats for {gene.id} ({num_subs} CDSs)")
logging.debug(f"Insert transcript-exon feats for {gene.id} ({num_subs} CDSs)")
transcripts = self.gene_to_cds(gene)
gene.sub_features = transcripts

# Transform gene - exon to gene-transcript-exon
elif fcounter.get("exon"):
num_subs = len(gene.sub_features)
print(f"Insert transcript for {gene.id} ({num_subs} exons)")
logging.debug(f"Insert transcript for {gene.id} ({num_subs} exons)")
transcript = self.gene_to_exon(gene)
gene.sub_features = [transcript]
else:
Expand Down Expand Up @@ -441,7 +444,7 @@ def _normalize_transcripts(self, gene: SeqFeature, fail_types) -> SeqFeature:
message = (
f"Unrecognized transcript type: {transcript.type}" f" for {transcript.id} ({gene.id})"
)
print(message)
logging.warning(message)
if skip_unrecognized:
transcripts_to_delete.append(count)
continue
Expand Down Expand Up @@ -515,7 +518,7 @@ def _normalize_transcript_subfeatures(
f"Unrecognized exon type for {feat.type}: {feat.id}"
f" (for transcript {transcript.id} of type {transcript.type})"
)
print(message)
logging.warning(message)
if self.skip_unrecognized:
exons_to_delete.append(tcount)
continue
Expand All @@ -538,7 +541,7 @@ def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature:
new_type = "ncRNA_gene"
if ncrna.type in ("tRNA", "rRNA"):
new_type = "gene"
print(f"Put the transcript {ncrna.type} in a {new_type} parent feature")
logging.debug(f"Put the transcript {ncrna.type} in a {new_type} parent feature")
gene = SeqFeature(ncrna.location, type=new_type)
gene.qualifiers["source"] = ncrna.qualifiers["source"]
gene.sub_features = [ncrna]
Expand All @@ -549,7 +552,7 @@ def transcript_gene(self, ncrna: SeqFeature) -> SeqFeature:
def cds_gene(self, cds: SeqFeature) -> SeqFeature:
"""Returns a gene created for a lone CDS."""

print("Put the lone CDS in gene-mRNA parent features")
logging.debug(f"Put the lone CDS in gene-mRNA parent features for {cds.id}")

# Create a transcript, add the CDS
transcript = SeqFeature(cds.location, type="mRNA")
Expand Down Expand Up @@ -603,7 +606,7 @@ def gene_to_cds(self, gene: SeqFeature) -> List[SeqFeature]:

# Add to transcript or create a new one
if cds.id not in transcripts_dict:
print(f"Create new mRNA for {cds.id}")
logging.debug(f"Create new mRNA for {cds.id}")
transcript = self.build_transcript(gene)
transcripts_dict[cds.id] = transcript
exon.qualifiers["source"] = gene.qualifiers["source"]
Expand Down Expand Up @@ -678,7 +681,7 @@ def move_cds_to_mrna(self, gene: SeqFeature) -> SeqFeature:
self._check_sub_cdss(gene, sub_cdss)
self._check_sub_exons(gene, cdss, sub_exons)

print(f"Gene {gene.id}: move {len(cdss)} CDSs to the mRNA")
logging.debug(f"Gene {gene.id}: move {len(cdss)} CDSs to the mRNA")
# No more issues? move the CDSs
mrna.sub_features += cdss
# And remove them from the gene
Expand Down Expand Up @@ -736,7 +739,7 @@ def clean_extra_exons(self, gene: SeqFeature) -> SeqFeature:
exon_has_id += 1
if exon_has_id:
if exon_has_id == len(exons):
print(f"Remove {exon_has_id} extra exons from {gene.id}")
logging.debug(f"Remove {exon_has_id} extra exons from {gene.id}")
gene.sub_features = mrnas
gene.sub_features += others
else:
Expand Down Expand Up @@ -774,20 +777,20 @@ def normalize_gene_id(self, gene: SeqFeature) -> str:

# In case the gene id is not valid, use the GeneID
if not self.valid_id(new_gene_id):
print(f"Gene id is not valid: {new_gene_id}")
logging.warning(f"Gene id is not valid: {new_gene_id}")
qual = gene.qualifiers
if "Dbxref" in qual:
for xref in qual["Dbxref"]:
(db, value) = xref.split(":")
if db == "GeneID":
new_gene_id = f"{db}_{value}"
print(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
logging.debug(f"Using GeneID {new_gene_id} for stable_id instead of {gene.id}")
return new_gene_id

# Make a new stable_id
if self.make_missing_stable_ids:
new_id = self.generate_stable_id()
print(f"New id: {new_gene_id} -> {new_id}")
logging.debug(f"New id: {new_gene_id} -> {new_id}")
return new_id
raise GFFParserError(f"Can't use invalid gene id for {gene}")

Expand Down Expand Up @@ -828,22 +831,22 @@ def valid_id(self, name: str) -> bool:

# Trna (from tRNAscan)
if re.search(r"^Trna", name):
print(f"Stable id is a Trna from tRNA-scan: {name}")
logging.debug(f"Stable ID is a tRNA from tRNA-scan: {name}")
return False

# Coordinates
if re.search(r"^.+:\d+..\d+", name):
print(f"Stable id is a coordinate: {name}")
logging.debug(f"Stable id is a coordinate: {name}")
return False

# Special characters
if re.search(r"[ |]", name):
print(f"Stable id contains special characters: {name}")
logging.debug(f"Stable id contains special characters: {name}")
return False

# Min length
if len(name) < min_length:
print(f"Stable id is too short (<{min_length}) {name}")
logging.debug(f"Stable id is too short (<{min_length}) {name}")
return False

return True
Expand Down Expand Up @@ -893,12 +896,12 @@ def remove_cds_from_pseudogene(self, gene: SeqFeature) -> None:
gene_subfeats = []
for transcript in gene.sub_features:
if transcript.type == "CDS":
print(f"Remove pseudo CDS {transcript.id}")
logging.debug(f"Remove pseudo CDS {transcript.id}")
continue
new_subfeats = []
for feat in transcript.sub_features:
if feat.type == "CDS":
print(f"Remove pseudo CDS {feat.id}")
logging.debug(f"Remove pseudo CDS {feat.id}")
continue
new_subfeats.append(feat)
transcript.sub_features = new_subfeats
Expand Down Expand Up @@ -935,9 +938,12 @@ def main() -> None:
default=Path("functional_annotation.json"),
help="Output functional annotation JSON file",
)
parser.add_log_arguments(add_log_file=True)
args = parser.parse_args()
init_logging(args.log_level, args.log_file, args.log_file_level)

# Merge multiline gene features in a separate file
logging.info("Checking for genes to merge...")
interim_gff_path = Path(f"{args.in_gff_path}_INTERIM_MERGE")
merger = GFFGeneMerger()
merged_genes = merger.merge(args.in_gff_path, interim_gff_path)
Expand All @@ -946,12 +952,14 @@ def main() -> None:
# If there are split genes, decide to merge, or just die
if num_merged_genes > 0:
# Report the list of merged genes in case something does not look right
print(f"{num_merged_genes} genes merged:\n" + "\n".join(merged_genes))
logging.info(f"{num_merged_genes} genes merged")
logging.debug("\n".join(merged_genes))
# Use the GFF with the merged genes for the next part
in_gff_path = interim_gff_path

# Load GFF3 data and write a simpler version that follows our specifications as well as a
# functional annotation JSON file
logging.info("Simplify and fix GFF3")
gff_data = GFFSimplifier(args.genome_data, args.make_missing_stable_ids)
gff_data.simpler_gff3(in_gff_path)
gff_data.records.to_gff(args.out_gff_path)
Expand Down

0 comments on commit 8d91580

Please sign in to comment.