diff --git a/GenBankParser.py b/GenBankParser.py index 8df3fac..a48f12d 100644 --- a/GenBankParser.py +++ b/GenBankParser.py @@ -6,6 +6,8 @@ from Bio.SeqFeature import CompoundLocation from functools import lru_cache +from PAMProcessor import PAMProcessor + class GenBankReader: def __init__(self, filename): @@ -18,57 +20,6 @@ def records(self): return SeqIO.to_dict(SeqIO.parse(handle, "genbank")) -class PAMFinder: - def __init__(self, records, pam, direction): - self.records = records - self.pam = pam - self.pam_length = len(pam) - self.pam_pattern = self.pam.replace("N", "[ATCG]") - - self.direction = direction - - def get_pam_seq(self, row): - # Fetch the sequence for the range - sequence = self.records[row.Chromosome].seq[row.Start : row.End] - - # If the strand is "-", get the reverse complement of the sequence - if row.Strand == "-": - sequence = sequence.reverse_complement() - - # Get the PAM sequence - if self.direction == "upstream": - - if row.Strand == "+": - pam_sequence = self.records[row.Chromosome].seq[ - row.Start - self.pam_length : row.Start - ] - else: - pam_sequence = self.records[row.Chromosome].seq[ - row.End : row.End + self.pam_length - ] - elif self.direction == "downstream": - if row.Strand == "+": - pam_sequence = self.records[row.Chromosome].seq[ - row.End : row.End + self.pam_length - ] - else: - pam_sequence = self.records[row.Chromosome].seq[ - row.Start - self.pam_length : row.Start - ] - else: - raise ValueError("direction must be 'upstream' or 'downstream'") - - # If the strand is "-", get the reverse complement of the PAM sequence - if row.Strand == "-": - pam_sequence = pam_sequence.reverse_complement() - - return str(pam_sequence) - - def pam_matches(self, sequence): - # check if the sequence matches the PAM pattern - return bool(re.search(self.pam_pattern, sequence)) - - class GenBankParser(Logger): def __init__(self, filename): super().__init__() diff --git a/PAMProcessor.py b/PAMProcessor.py new file mode 100644 index 0000000..867871d --- /dev/null +++ b/PAMProcessor.py @@ -0,0 +1,92 @@ +import re + + +class PAMProcessor: + def __init__(self, records, pam, direction): + self.records = records + self.pam = pam.replace("N", "[ATCG]") + self.direction = direction + + def get_sequence(self, row): + sequence = self.records[row.Chromosome].seq[row.Start : row.End] + if row.Strand == "-": + sequence = sequence.reverse_complement() + return sequence + + +class GuideFinderByPAM(PAMProcessor): + def __init__(self, records, pam, direction, length): + super().__init__(records, pam, direction) + self.length = length + + def find_pam_sequences(self): + pam_sequences = [] + + for record_id, record in self.records.items(): + for strand, sequence in [ + (+1, record.seq), + (-1, record.seq.reverse_complement()), + ]: + sequence_str = str(sequence) + for match in re.finditer(self.pam, sequence_str): + start = match.start() + end = match.end() + + if ( + self.direction == "downstream" + and strand == 1 + or self.direction == "upstream" + and strand == -1 + ): + if start >= self.length: + guide_sequence = sequence_str[start - self.length : start] + pam_sequences.append(guide_sequence) + elif ( + self.direction == "upstream" + and strand == 1 + or self.direction == "downstream" + and strand == -1 + ): + if end + self.length <= len(sequence_str): + guide_sequence = sequence_str[end : end + self.length] + pam_sequences.append(guide_sequence) + + return pam_sequences + + +class PAMRetriever(PAMProcessor): + def __init__(self, records, pam, direction): + super().__init__(records, pam, direction) + self.pam_length = len(pam) + + def get_pam_seq(self, row): + sequence = self.get_sequence(row) + + if self.direction == "upstream": + if row.Strand == "+": + pam_sequence = self.records[row.Chromosome].seq[ + row.Start - self.pam_length : row.Start + ] + else: + pam_sequence = self.records[row.Chromosome].seq[ + row.End : row.End + self.pam_length + ] + elif self.direction == "downstream": + if row.Strand == "+": + pam_sequence = self.records[row.Chromosome].seq[ + row.End : row.End + self.pam_length + ] + else: + pam_sequence = self.records[row.Chromosome].seq[ + row.Start - self.pam_length : row.Start + ] + else: + raise ValueError("direction must be 'upstream' or 'downstream'") + + if row.Strand == "-": + pam_sequence = pam_sequence.reverse_complement() + + return str(pam_sequence) + + def pam_matches(self, sequence): + return bool(re.search(self.pam, sequence)) diff --git a/testing_grounds.py b/testing_grounds.py index 86dab7a..e603289 100644 --- a/testing_grounds.py +++ b/testing_grounds.py @@ -6,10 +6,10 @@ from BarCodeLibrary import BarCodeLibrary from BowtieRunner import BowtieRunner from CRISPRiLibrary import CRISPRiLibrary -from GenBankParser import GenBankParser, PAMFinder +from GenBankParser import GenBankParser +from PAMProcessor import PAMProcessor, GuideFinderByPAM, PAMRetriever from Logger import Logger from PySamParser import PySamParser -import polars as pl import pyranges as pr ### Example usage ### @@ -18,34 +18,40 @@ # Create a BarCodeLibrary instance by reading a TSV file containing barcodes. # The 'column' parameter specifies which column in the TSV to use for barcodes. -barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer") +# barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer") # Parse a GenBank file to extract genetic information using GenBankParser. genbank = GenBankParser("GCA_003054575.1.gb") -# Find Protospacer Adjacent Motifs (PAMs) using the PAMFinder class. -# It searches for the specified PAM sequence ('NGNC') downstream of the genetic records. -pam = PAMFinder(genbank.records, "NGNC", "downstream") - -# Use BowtieRunner as a context manager to handle the lifecycle of the Bowtie alignment tool. -# This block will create necessary FASTA and FASTQ files, build an index, and perform alignment. -with BowtieRunner() as bowtie: - # Convert GenBank records to FASTA format. - bowtie.make_fasta(genbank.records) - bowtie.make_fastq(barcodes.barcodes) # Convert barcodes to FASTQ format. - bowtie.create_index() # Create an index for the Bowtie alignment. - # Perform the alignment with specified parameters. - bowtie.align(num_mismatches=1, num_threads=12) - # Parse the resulting SAM file to extract alignment data. - sam = PySamParser(bowtie.sam_path) - # Join the alignment data with GenBank ranges. - targets = sam.ranges.join(genbank.ranges) - -# Create a CRISPRiLibrary instance to manage CRISPR interference (CRISPRi) guides. -guides = CRISPRiLibrary(targets.df, pam) - -# Print the unique targets for the CRISPRi library. -print(guides.unique_targets) - -# Print unambiguous targets for the CRISPRi library. -print(guides.unambiguous_targets) +finder = GuideFinderByPAM(genbank.records, "GGGGGG", "downstream", 32) + +pam_sequences = finder.find_pam_sequences() + +print(pam_sequences[1:20]) + +# # Find Protospacer Adjacent Motifs (PAMs) using the PAMFinder class. +# # It searches for the specified PAM sequence ('NGNC') downstream of the genetic records. +# pam = PAMRetriever(genbank.records, "NGNC", "downstream") + +# # Use BowtieRunner as a context manager to handle the lifecycle of the Bowtie alignment tool. +# # This block will create necessary FASTA and FASTQ files, build an index, and perform alignment. +# with BowtieRunner() as bowtie: +# # Convert GenBank records to FASTA format. +# bowtie.make_fasta(genbank.records) +# bowtie.make_fastq(barcodes.barcodes) # Convert barcodes to FASTQ format. +# bowtie.create_index() # Create an index for the Bowtie alignment. +# # Perform the alignment with specified parameters. +# bowtie.align(num_mismatches=1, num_threads=12) +# # Parse the resulting SAM file to extract alignment data. +# sam = PySamParser(bowtie.sam_path) +# # Join the alignment data with GenBank ranges. +# targets = sam.ranges.join(genbank.ranges) + +# # Create a CRISPRiLibrary instance to manage CRISPR interference (CRISPRi) guides. +# guides = CRISPRiLibrary(targets.df, pam) + +# # Print the unique targets for the CRISPRi library. +# print(guides.unique_targets) + +# # Print unambiguous targets for the CRISPRi library. +# print(guides.unambiguous_targets)