redo logic for guide features to keep

ryandward · Mar 7, 2024 · 126c279 · 126c279
1 parent 2d1a877
commit 126c279
Showing 1 changed file with 58 additions and 125 deletions.
diff --git a/testing_grounds.py b/testing_grounds.py
@@ -3,152 +3,85 @@
 # Related third party imports
 from Bio.SeqFeature import CompoundLocation
 import pyranges as pr
+import polars as pl
 
 from BarCodeLibrary import BarCodeLibrary
 from BowtieRunner import BowtieRunner
 from GenBankParser import GenBankParser
 from Logger import Logger
 from PySamParser import PySamParser
-
+from GenBankParser import PAMFinder
 
 ### Example usage ###
 logger = Logger()
 barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer")
 genbank = GenBankParser("GCA_003054575.1.gb")
-
+pam = PAMFinder(genbank.records, "NGNC", "downstream")
 
 with BowtieRunner() as bowtie:
     bowtie.make_fasta(genbank.records)
     bowtie.make_fastq(barcodes.barcodes)
     bowtie.create_index()
-    bowtie.align(num_mismatches=0, num_threads=12)
-
+    bowtie.align(num_mismatches=1, num_threads=12)
     sam = PySamParser(bowtie.sam_path)
-
-    # TODO: This is a work in progress. I am currently exploring how to create a class.
-    pam = "NGNC"
-    direction = "downstream"
-
     targets = sam.ranges.join(genbank.ranges)
 
-    ## compile stats ##
-
-    # barcodes with genome targets, here the genome is called "source"
-    source_targets = targets[targets.Type == "source"]
-
-    logger.info(f"Found {len(source_targets)} barcodes with genome targets ...")
-
-    # Create a DataFrame with unique combinations of 'Barcode', 'Chromosome', 'Start', 'End'
-    unique_genomic_sites = source_targets.df.drop_duplicates(
-        subset=["Barcode", "Chromosome", "Start", "End"]
-    )
-
-    logger.info(f"Found {len(unique_genomic_sites)} unique genomic sites ...")
-
-    # Find barcodes that appear more than once in the unique combinations
-    barcode_occurrences_in_unique_sites = unique_genomic_sites["Barcode"].value_counts()
-
-    logger.info(f"Found {len(barcode_occurrences_in_unique_sites)} unique barcodes ...")
-
-    barcodes_in_multiple_sites = barcode_occurrences_in_unique_sites[
-        barcode_occurrences_in_unique_sites > 1
-    ].index
-
-    logger.info(
-        f"Found {len(barcodes_in_multiple_sites)} barcodes that appear in multiple unique genomic sites ..."
-    )
-
-    multiple_site_barcodes_df = source_targets[
-        source_targets.Barcode.isin(barcodes_in_multiple_sites)
-    ].df
-
-    logger.info(
-        f"Found {len(multiple_site_barcodes_df)} sites where these barcodes appear ..."
-    )
-
-    # Add the 'Sites' column
-    multiple_site_barcodes_df["Sites"] = multiple_site_barcodes_df["Barcode"].map(
-        barcode_occurrences_in_unique_sites
+targets.PAM = targets.df.apply(lambda row: pam.get_pam_seq(row), axis=1)
+
+targets.Targeting = targets.PAM.apply(lambda x: pam.pam_matches(x))
+
+# "Source" targets are the targets that are in the genome, agnostic to genes or features
+
+# Immediately reject Barcodes that:
+# 1. Do not have a PAM
+# 2. Are not targeting
+# 3. Are not mapped
+# 4. Are not unique (i.e. have multiple targets)
+
+source_unique_targets = (
+    targets.df[
+        (targets.df["Type"] == "source")
+        & (targets.df["Targeting"] == True)
+        & (targets.df["Mapped"] == True)
+    ]
+    .loc[lambda df: ~df.duplicated(subset=["Barcode"])]
+    .reset_index(drop=True)
+)
+
+
+# "Feature" targets are the targets that are in the genome and are annotated to a feature
+
+feature_targets = (
+    targets.df[
+        (targets.df["Type"] != "source")
+        & (targets.df["Targeting"] == True)
+        & (targets.df["Mapped"] == True)
+    ]
+    .assign(
+        Offset=lambda df: df.apply(
+            lambda row: {"+": row.Start - row.Start_b, "-": row.End_b - row.End}.get(
+                row.Strand_b, None
+            ),
+            axis=1,
+        ),
+        Overlap=lambda df: df.apply(
+            lambda row: max(min(row.End, row.End_b) - max(row.Start, row.Start_b), 0),
+            axis=1,
+        ),
     )
+    .reset_index(drop=True)
+)
 
-    # Convert DataFrame back to PyRanges
-    multiple_site_barcodes = pr.PyRanges(multiple_site_barcodes_df)
+# Features that are not in multiple sites, but might have multiple overlapping annotations
 
-    def convert_pam(pam):
-        return pam.replace("N", "[GACT]")
+feature_unique_targets = (
+    feature_targets[feature_targets["Barcode"].isin(source_unique_targets.Barcode)]
+    .sort_values(["Chromosome", "Start", "End"])
+    .reset_index(drop=True)
+)
 
-    pam_search = convert_pam(pam)
-
-    targets.PAM = targets.df.apply(
-        lambda row: genbank.get_pam_sequence(row, len(pam), direction), axis=1
-    )
-
-    targets.Offset = targets.df.apply(
-        lambda row: (
-            row.Start - row.Start_b if row.Strand_b == "+" else row.End_b - row.End
-        ),
-        axis=1,
-    )
-
-    targets.Overlap = targets.df.apply(
-        lambda row: min(row.End, row.End_b) - max(row.Start, row.Start_b), axis=1
-    )
+# Features that are not in multiple sites and have unique annotations
 
-    targets_with_pam = targets[targets.PAM.str.contains(pam_search, regex=True)]
-
-    if targets_with_pam.empty:
-        logger.warn(f"No targets with matching PAM: '{pam}'")
-
-    else:
-        # here we're leveraging that source contains the whole genome to get the barcode occurrences
-        pam_barcode_occurrences = targets_with_pam[
-            targets_with_pam.Type == "source"
-        ].Barcode.value_counts()
-
-        gene_targets_with_pam = targets_with_pam[targets_with_pam.Type == "gene"]
-
-        logger.info(f"Taken {len(sam.ranges)} barcodes from aligner ...")
-        logger.info(
-            f"Found {len(targets[targets.Type == 'source'])} barcodes in the genome ..."
-        )
-        logger.info(
-            f"Found {len(pam_barcode_occurrences)} targets with matching PAM: '{pam}'"
-        )
-
-        gene_targets_with_pam_in_multiple_sites = gene_targets_with_pam[
-            gene_targets_with_pam.Barcode.isin(barcodes_in_multiple_sites)
-        ]
-
-        logger.info(
-            f"Found {len(gene_targets_with_pam_in_multiple_sites)} gene targets with matching PAM in multiple sites ..."
-        )
-
-        # return the genes with pams that are NOT in multiple sites
-        gene_targets_with_pam_not_in_multiple_sites = gene_targets_with_pam[
-            ~gene_targets_with_pam.Barcode.isin(barcodes_in_multiple_sites)
-        ]
-
-        logger.info(
-            f"Found {len(gene_targets_with_pam_not_in_multiple_sites)} gene targets with matching PAM NOT in multiple sites ..."
-        )
-        logger.info(gene_targets_with_pam_not_in_multiple_sites.df)
-
-        # return sites with PAMS that do NOT have genes associated with
-        sites_with_pam_no_genes = targets_with_pam[
-            ~targets_with_pam.Barcode.isin(gene_targets_with_pam.Barcode)
-        ]
-        logger.info(
-            f"Found {len(sites_with_pam_no_genes)} sites with PAMs that are not located in a gene ..."
-        )
-        if len(sites_with_pam_no_genes) > 0:
-            logger.info(sites_with_pam_no_genes.df)
-
-            # return the sites without a gene with PAMs that are NOT in multiple sites
-            sites_with_pam_no_genes_not_in_multiple_sites = sites_with_pam_no_genes[
-                ~sites_with_pam_no_genes.Barcode.isin(barcodes_in_multiple_sites)
-            ]
-            logger.info(
-                f"Found {len(sites_with_pam_no_genes_not_in_multiple_sites)} sites with PAMs that are not located in a gene and NOT in multiple sites ..."
-            )
-            if len(sites_with_pam_no_genes_not_in_multiple_sites) > 0:
-                logger.info(sites_with_pam_no_genes_not_in_multiple_sites.df)
+feature_unambiguous_targets = feature_unique_targets[
+    ~feature_unique_targets.duplicated(subset=["Barcode"]).reset_index(drop=True)
+]