moved PAM logic to its own class

ryandward · Mar 11, 2024 · c34590e · c34590e
1 parent 659d669
commit c34590e
Show file tree

Hide file tree

Showing 3 changed files with 129 additions and 80 deletions.
diff --git a/GenBankParser.py b/GenBankParser.py
@@ -6,6 +6,8 @@
 from Bio.SeqFeature import CompoundLocation
 from functools import lru_cache
 
+from PAMProcessor import PAMProcessor
+
 
 class GenBankReader:
     def __init__(self, filename):
@@ -18,57 +20,6 @@ def records(self):
             return SeqIO.to_dict(SeqIO.parse(handle, "genbank"))
 
 
-class PAMFinder:
-    def __init__(self, records, pam, direction):
-        self.records = records
-        self.pam = pam
-        self.pam_length = len(pam)
-        self.pam_pattern = self.pam.replace("N", "[ATCG]")
-
-        self.direction = direction
-
-    def get_pam_seq(self, row):
-        # Fetch the sequence for the range
-        sequence = self.records[row.Chromosome].seq[row.Start : row.End]
-
-        # If the strand is "-", get the reverse complement of the sequence
-        if row.Strand == "-":
-            sequence = sequence.reverse_complement()
-
-        # Get the PAM sequence
-        if self.direction == "upstream":
-
-            if row.Strand == "+":
-                pam_sequence = self.records[row.Chromosome].seq[
-                    row.Start - self.pam_length : row.Start
-                ]
-            else:
-                pam_sequence = self.records[row.Chromosome].seq[
-                    row.End : row.End + self.pam_length
-                ]
-        elif self.direction == "downstream":
-            if row.Strand == "+":
-                pam_sequence = self.records[row.Chromosome].seq[
-                    row.End : row.End + self.pam_length
-                ]
-            else:
-                pam_sequence = self.records[row.Chromosome].seq[
-                    row.Start - self.pam_length : row.Start
-                ]
-        else:
-            raise ValueError("direction must be 'upstream' or 'downstream'")
-
-        # If the strand is "-", get the reverse complement of the PAM sequence
-        if row.Strand == "-":
-            pam_sequence = pam_sequence.reverse_complement()
-
-        return str(pam_sequence)
-
-    def pam_matches(self, sequence):
-        # check if the sequence matches the PAM pattern
-        return bool(re.search(self.pam_pattern, sequence))
-
-
 class GenBankParser(Logger):
     def __init__(self, filename):
         super().__init__()

diff --git a/PAMProcessor.py b/PAMProcessor.py
@@ -0,0 +1,92 @@
+import re
+
+
+class PAMProcessor:
+    def __init__(self, records, pam, direction):
+        self.records = records
+        self.pam = pam.replace("N", "[ATCG]")
+        self.direction = direction
+
+    def get_sequence(self, row):
+        sequence = self.records[row.Chromosome].seq[row.Start : row.End]
+        if row.Strand == "-":
+            sequence = sequence.reverse_complement()
+        return sequence
+
+
+class GuideFinderByPAM(PAMProcessor):
+    def __init__(self, records, pam, direction, length):
+        super().__init__(records, pam, direction)
+        self.length = length
+
+    def find_pam_sequences(self):
+        pam_sequences = []
+
+        for record_id, record in self.records.items():
+            for strand, sequence in [
+                (+1, record.seq),
+                (-1, record.seq.reverse_complement()),
+            ]:
+                sequence_str = str(sequence)
+                for match in re.finditer(self.pam, sequence_str):
+                    start = match.start()
+                    end = match.end()
+
+                    if (
+                        self.direction == "downstream"
+                        and strand == 1
+                        or self.direction == "upstream"
+                        and strand == -1
+                    ):
+                        if start >= self.length:
+                            guide_sequence = sequence_str[start - self.length : start]
+                            pam_sequences.append(guide_sequence)
+                    elif (
+                        self.direction == "upstream"
+                        and strand == 1
+                        or self.direction == "downstream"
+                        and strand == -1
+                    ):
+                        if end + self.length <= len(sequence_str):
+                            guide_sequence = sequence_str[end : end + self.length]
+                            pam_sequences.append(guide_sequence)
+
+        return pam_sequences
+
+
+class PAMRetriever(PAMProcessor):
+    def __init__(self, records, pam, direction):
+        super().__init__(records, pam, direction)
+        self.pam_length = len(pam)
+
+    def get_pam_seq(self, row):
+        sequence = self.get_sequence(row)
+
+        if self.direction == "upstream":
+            if row.Strand == "+":
+                pam_sequence = self.records[row.Chromosome].seq[
+                    row.Start - self.pam_length : row.Start
+                ]
+            else:
+                pam_sequence = self.records[row.Chromosome].seq[
+                    row.End : row.End + self.pam_length
+                ]
+        elif self.direction == "downstream":
+            if row.Strand == "+":
+                pam_sequence = self.records[row.Chromosome].seq[
+                    row.End : row.End + self.pam_length
+                ]
+            else:
+                pam_sequence = self.records[row.Chromosome].seq[
+                    row.Start - self.pam_length : row.Start
+                ]
+        else:
+            raise ValueError("direction must be 'upstream' or 'downstream'")
+
+        if row.Strand == "-":
+            pam_sequence = pam_sequence.reverse_complement()
+
+        return str(pam_sequence)
+
+    def pam_matches(self, sequence):
+        return bool(re.search(self.pam, sequence))
diff --git a/testing_grounds.py b/testing_grounds.py
@@ -6,10 +6,10 @@
 from BarCodeLibrary import BarCodeLibrary
 from BowtieRunner import BowtieRunner
 from CRISPRiLibrary import CRISPRiLibrary
-from GenBankParser import GenBankParser, PAMFinder
+from GenBankParser import GenBankParser
+from PAMProcessor import PAMProcessor, GuideFinderByPAM, PAMRetriever
 from Logger import Logger
 from PySamParser import PySamParser
-import polars as pl
 import pyranges as pr
 
 ### Example usage ###
@@ -18,34 +18,40 @@
 
 # Create a BarCodeLibrary instance by reading a TSV file containing barcodes.
 # The 'column' parameter specifies which column in the TSV to use for barcodes.
-barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer")
+# barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer")
 
 # Parse a GenBank file to extract genetic information using GenBankParser.
 genbank = GenBankParser("GCA_003054575.1.gb")
 
-# Find Protospacer Adjacent Motifs (PAMs) using the PAMFinder class.
-# It searches for the specified PAM sequence ('NGNC') downstream of the genetic records.
-pam = PAMFinder(genbank.records, "NGNC", "downstream")
-
-# Use BowtieRunner as a context manager to handle the lifecycle of the Bowtie alignment tool.
-# This block will create necessary FASTA and FASTQ files, build an index, and perform alignment.
-with BowtieRunner() as bowtie:
-    # Convert GenBank records to FASTA format.
-    bowtie.make_fasta(genbank.records)
-    bowtie.make_fastq(barcodes.barcodes)  # Convert barcodes to FASTQ format.
-    bowtie.create_index()  # Create an index for the Bowtie alignment.
-    # Perform the alignment with specified parameters.
-    bowtie.align(num_mismatches=1, num_threads=12)
-    # Parse the resulting SAM file to extract alignment data.
-    sam = PySamParser(bowtie.sam_path)
-    # Join the alignment data with GenBank ranges.
-    targets = sam.ranges.join(genbank.ranges)
-
-# Create a CRISPRiLibrary instance to manage CRISPR interference (CRISPRi) guides.
-guides = CRISPRiLibrary(targets.df, pam)
-
-# Print the unique targets for the CRISPRi library.
-print(guides.unique_targets)
-
-# Print unambiguous targets for the CRISPRi library.
-print(guides.unambiguous_targets)
+finder = GuideFinderByPAM(genbank.records, "GGGGGG", "downstream", 32)
+
+pam_sequences = finder.find_pam_sequences()
+
+print(pam_sequences[1:20])
+
+# # Find Protospacer Adjacent Motifs (PAMs) using the PAMFinder class.
+# # It searches for the specified PAM sequence ('NGNC') downstream of the genetic records.
+# pam = PAMRetriever(genbank.records, "NGNC", "downstream")
+
+# # Use BowtieRunner as a context manager to handle the lifecycle of the Bowtie alignment tool.
+# # This block will create necessary FASTA and FASTQ files, build an index, and perform alignment.
+# with BowtieRunner() as bowtie:
+#     # Convert GenBank records to FASTA format.
+#     bowtie.make_fasta(genbank.records)
+#     bowtie.make_fastq(barcodes.barcodes)  # Convert barcodes to FASTQ format.
+#     bowtie.create_index()  # Create an index for the Bowtie alignment.
+#     # Perform the alignment with specified parameters.
+#     bowtie.align(num_mismatches=1, num_threads=12)
+#     # Parse the resulting SAM file to extract alignment data.
+#     sam = PySamParser(bowtie.sam_path)
+#     # Join the alignment data with GenBank ranges.
+#     targets = sam.ranges.join(genbank.ranges)
+
+# # Create a CRISPRiLibrary instance to manage CRISPR interference (CRISPRi) guides.
+# guides = CRISPRiLibrary(targets.df, pam)
+
+# # Print the unique targets for the CRISPRi library.
+# print(guides.unique_targets)
+
+# # Print unambiguous targets for the CRISPRi library.
+# print(guides.unambiguous_targets)