Skip to content

Commit

Permalink
moved PAM logic to its own class
Browse files Browse the repository at this point in the history
  • Loading branch information
ryandward committed Mar 11, 2024
1 parent 659d669 commit c34590e
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 80 deletions.
53 changes: 2 additions & 51 deletions GenBankParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from Bio.SeqFeature import CompoundLocation
from functools import lru_cache

from PAMProcessor import PAMProcessor


class GenBankReader:
def __init__(self, filename):
Expand All @@ -18,57 +20,6 @@ def records(self):
return SeqIO.to_dict(SeqIO.parse(handle, "genbank"))


class PAMFinder:
def __init__(self, records, pam, direction):
self.records = records
self.pam = pam
self.pam_length = len(pam)
self.pam_pattern = self.pam.replace("N", "[ATCG]")

self.direction = direction

def get_pam_seq(self, row):
# Fetch the sequence for the range
sequence = self.records[row.Chromosome].seq[row.Start : row.End]

# If the strand is "-", get the reverse complement of the sequence
if row.Strand == "-":
sequence = sequence.reverse_complement()

# Get the PAM sequence
if self.direction == "upstream":

if row.Strand == "+":
pam_sequence = self.records[row.Chromosome].seq[
row.Start - self.pam_length : row.Start
]
else:
pam_sequence = self.records[row.Chromosome].seq[
row.End : row.End + self.pam_length
]
elif self.direction == "downstream":
if row.Strand == "+":
pam_sequence = self.records[row.Chromosome].seq[
row.End : row.End + self.pam_length
]
else:
pam_sequence = self.records[row.Chromosome].seq[
row.Start - self.pam_length : row.Start
]
else:
raise ValueError("direction must be 'upstream' or 'downstream'")

# If the strand is "-", get the reverse complement of the PAM sequence
if row.Strand == "-":
pam_sequence = pam_sequence.reverse_complement()

return str(pam_sequence)

def pam_matches(self, sequence):
# check if the sequence matches the PAM pattern
return bool(re.search(self.pam_pattern, sequence))


class GenBankParser(Logger):
def __init__(self, filename):
super().__init__()
Expand Down
92 changes: 92 additions & 0 deletions PAMProcessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import re


class PAMProcessor:
def __init__(self, records, pam, direction):
self.records = records
self.pam = pam.replace("N", "[ATCG]")
self.direction = direction

def get_sequence(self, row):
sequence = self.records[row.Chromosome].seq[row.Start : row.End]
if row.Strand == "-":
sequence = sequence.reverse_complement()
return sequence


class GuideFinderByPAM(PAMProcessor):
def __init__(self, records, pam, direction, length):
super().__init__(records, pam, direction)
self.length = length

def find_pam_sequences(self):
pam_sequences = []

for record_id, record in self.records.items():
for strand, sequence in [
(+1, record.seq),
(-1, record.seq.reverse_complement()),
]:
sequence_str = str(sequence)
for match in re.finditer(self.pam, sequence_str):
start = match.start()
end = match.end()

if (
self.direction == "downstream"
and strand == 1
or self.direction == "upstream"
and strand == -1
):
if start >= self.length:
guide_sequence = sequence_str[start - self.length : start]
pam_sequences.append(guide_sequence)
elif (
self.direction == "upstream"
and strand == 1
or self.direction == "downstream"
and strand == -1
):
if end + self.length <= len(sequence_str):
guide_sequence = sequence_str[end : end + self.length]
pam_sequences.append(guide_sequence)

return pam_sequences


class PAMRetriever(PAMProcessor):
def __init__(self, records, pam, direction):
super().__init__(records, pam, direction)
self.pam_length = len(pam)

def get_pam_seq(self, row):
sequence = self.get_sequence(row)

if self.direction == "upstream":
if row.Strand == "+":
pam_sequence = self.records[row.Chromosome].seq[
row.Start - self.pam_length : row.Start
]
else:
pam_sequence = self.records[row.Chromosome].seq[
row.End : row.End + self.pam_length
]
elif self.direction == "downstream":
if row.Strand == "+":
pam_sequence = self.records[row.Chromosome].seq[
row.End : row.End + self.pam_length
]
else:
pam_sequence = self.records[row.Chromosome].seq[
row.Start - self.pam_length : row.Start
]
else:
raise ValueError("direction must be 'upstream' or 'downstream'")

if row.Strand == "-":
pam_sequence = pam_sequence.reverse_complement()

return str(pam_sequence)

def pam_matches(self, sequence):
return bool(re.search(self.pam, sequence))
64 changes: 35 additions & 29 deletions testing_grounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from BarCodeLibrary import BarCodeLibrary
from BowtieRunner import BowtieRunner
from CRISPRiLibrary import CRISPRiLibrary
from GenBankParser import GenBankParser, PAMFinder
from GenBankParser import GenBankParser
from PAMProcessor import PAMProcessor, GuideFinderByPAM, PAMRetriever
from Logger import Logger
from PySamParser import PySamParser
import polars as pl
import pyranges as pr

### Example usage ###
Expand All @@ -18,34 +18,40 @@

# Create a BarCodeLibrary instance by reading a TSV file containing barcodes.
# The 'column' parameter specifies which column in the TSV to use for barcodes.
barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer")
# barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer")

# Parse a GenBank file to extract genetic information using GenBankParser.
genbank = GenBankParser("GCA_003054575.1.gb")

# Find Protospacer Adjacent Motifs (PAMs) using the PAMFinder class.
# It searches for the specified PAM sequence ('NGNC') downstream of the genetic records.
pam = PAMFinder(genbank.records, "NGNC", "downstream")

# Use BowtieRunner as a context manager to handle the lifecycle of the Bowtie alignment tool.
# This block will create necessary FASTA and FASTQ files, build an index, and perform alignment.
with BowtieRunner() as bowtie:
# Convert GenBank records to FASTA format.
bowtie.make_fasta(genbank.records)
bowtie.make_fastq(barcodes.barcodes) # Convert barcodes to FASTQ format.
bowtie.create_index() # Create an index for the Bowtie alignment.
# Perform the alignment with specified parameters.
bowtie.align(num_mismatches=1, num_threads=12)
# Parse the resulting SAM file to extract alignment data.
sam = PySamParser(bowtie.sam_path)
# Join the alignment data with GenBank ranges.
targets = sam.ranges.join(genbank.ranges)

# Create a CRISPRiLibrary instance to manage CRISPR interference (CRISPRi) guides.
guides = CRISPRiLibrary(targets.df, pam)

# Print the unique targets for the CRISPRi library.
print(guides.unique_targets)

# Print unambiguous targets for the CRISPRi library.
print(guides.unambiguous_targets)
finder = GuideFinderByPAM(genbank.records, "GGGGGG", "downstream", 32)

pam_sequences = finder.find_pam_sequences()

print(pam_sequences[1:20])

# # Find Protospacer Adjacent Motifs (PAMs) using the PAMFinder class.
# # It searches for the specified PAM sequence ('NGNC') downstream of the genetic records.
# pam = PAMRetriever(genbank.records, "NGNC", "downstream")

# # Use BowtieRunner as a context manager to handle the lifecycle of the Bowtie alignment tool.
# # This block will create necessary FASTA and FASTQ files, build an index, and perform alignment.
# with BowtieRunner() as bowtie:
# # Convert GenBank records to FASTA format.
# bowtie.make_fasta(genbank.records)
# bowtie.make_fastq(barcodes.barcodes) # Convert barcodes to FASTQ format.
# bowtie.create_index() # Create an index for the Bowtie alignment.
# # Perform the alignment with specified parameters.
# bowtie.align(num_mismatches=1, num_threads=12)
# # Parse the resulting SAM file to extract alignment data.
# sam = PySamParser(bowtie.sam_path)
# # Join the alignment data with GenBank ranges.
# targets = sam.ranges.join(genbank.ranges)

# # Create a CRISPRiLibrary instance to manage CRISPR interference (CRISPRi) guides.
# guides = CRISPRiLibrary(targets.df, pam)

# # Print the unique targets for the CRISPRi library.
# print(guides.unique_targets)

# # Print unambiguous targets for the CRISPRi library.
# print(guides.unambiguous_targets)

0 comments on commit c34590e

Please sign in to comment.