Skip to content

Commit

Permalink
redo logic for guide features to keep
Browse files Browse the repository at this point in the history
  • Loading branch information
ryandward committed Mar 7, 2024
1 parent 2d1a877 commit 126c279
Showing 1 changed file with 58 additions and 125 deletions.
183 changes: 58 additions & 125 deletions testing_grounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,152 +3,85 @@
# Related third party imports
from Bio.SeqFeature import CompoundLocation
import pyranges as pr
import polars as pl

from BarCodeLibrary import BarCodeLibrary
from BowtieRunner import BowtieRunner
from GenBankParser import GenBankParser
from Logger import Logger
from PySamParser import PySamParser

from GenBankParser import PAMFinder

### Example usage ###
logger = Logger()
barcodes = BarCodeLibrary("Example_Libraries/CN-32-zmo.tsv", column="spacer")
genbank = GenBankParser("GCA_003054575.1.gb")

pam = PAMFinder(genbank.records, "NGNC", "downstream")

with BowtieRunner() as bowtie:
bowtie.make_fasta(genbank.records)
bowtie.make_fastq(barcodes.barcodes)
bowtie.create_index()
bowtie.align(num_mismatches=0, num_threads=12)

bowtie.align(num_mismatches=1, num_threads=12)
sam = PySamParser(bowtie.sam_path)

# TODO: This is a work in progress. I am currently exploring how to create a class.
pam = "NGNC"
direction = "downstream"

targets = sam.ranges.join(genbank.ranges)

## compile stats ##

# barcodes with genome targets, here the genome is called "source"
source_targets = targets[targets.Type == "source"]

logger.info(f"Found {len(source_targets)} barcodes with genome targets ...")

# Create a DataFrame with unique combinations of 'Barcode', 'Chromosome', 'Start', 'End'
unique_genomic_sites = source_targets.df.drop_duplicates(
subset=["Barcode", "Chromosome", "Start", "End"]
)

logger.info(f"Found {len(unique_genomic_sites)} unique genomic sites ...")

# Find barcodes that appear more than once in the unique combinations
barcode_occurrences_in_unique_sites = unique_genomic_sites["Barcode"].value_counts()

logger.info(f"Found {len(barcode_occurrences_in_unique_sites)} unique barcodes ...")

barcodes_in_multiple_sites = barcode_occurrences_in_unique_sites[
barcode_occurrences_in_unique_sites > 1
].index

logger.info(
f"Found {len(barcodes_in_multiple_sites)} barcodes that appear in multiple unique genomic sites ..."
)

multiple_site_barcodes_df = source_targets[
source_targets.Barcode.isin(barcodes_in_multiple_sites)
].df

logger.info(
f"Found {len(multiple_site_barcodes_df)} sites where these barcodes appear ..."
)

# Add the 'Sites' column
multiple_site_barcodes_df["Sites"] = multiple_site_barcodes_df["Barcode"].map(
barcode_occurrences_in_unique_sites
targets.PAM = targets.df.apply(lambda row: pam.get_pam_seq(row), axis=1)

targets.Targeting = targets.PAM.apply(lambda x: pam.pam_matches(x))

# "Source" targets are the targets that are in the genome, agnostic to genes or features

# Immediately reject Barcodes that:
# 1. Do not have a PAM
# 2. Are not targeting
# 3. Are not mapped
# 4. Are not unique (i.e. have multiple targets)

source_unique_targets = (
targets.df[
(targets.df["Type"] == "source")
& (targets.df["Targeting"] == True)
& (targets.df["Mapped"] == True)
]
.loc[lambda df: ~df.duplicated(subset=["Barcode"])]
.reset_index(drop=True)
)


# "Feature" targets are the targets that are in the genome and are annotated to a feature

feature_targets = (
targets.df[
(targets.df["Type"] != "source")
& (targets.df["Targeting"] == True)
& (targets.df["Mapped"] == True)
]
.assign(
Offset=lambda df: df.apply(
lambda row: {"+": row.Start - row.Start_b, "-": row.End_b - row.End}.get(
row.Strand_b, None
),
axis=1,
),
Overlap=lambda df: df.apply(
lambda row: max(min(row.End, row.End_b) - max(row.Start, row.Start_b), 0),
axis=1,
),
)
.reset_index(drop=True)
)

# Convert DataFrame back to PyRanges
multiple_site_barcodes = pr.PyRanges(multiple_site_barcodes_df)
# Features that are not in multiple sites, but might have multiple overlapping annotations

def convert_pam(pam):
return pam.replace("N", "[GACT]")
feature_unique_targets = (
feature_targets[feature_targets["Barcode"].isin(source_unique_targets.Barcode)]
.sort_values(["Chromosome", "Start", "End"])
.reset_index(drop=True)
)

pam_search = convert_pam(pam)

targets.PAM = targets.df.apply(
lambda row: genbank.get_pam_sequence(row, len(pam), direction), axis=1
)

targets.Offset = targets.df.apply(
lambda row: (
row.Start - row.Start_b if row.Strand_b == "+" else row.End_b - row.End
),
axis=1,
)

targets.Overlap = targets.df.apply(
lambda row: min(row.End, row.End_b) - max(row.Start, row.Start_b), axis=1
)
# Features that are not in multiple sites and have unique annotations

targets_with_pam = targets[targets.PAM.str.contains(pam_search, regex=True)]

if targets_with_pam.empty:
logger.warn(f"No targets with matching PAM: '{pam}'")

else:
# here we're leveraging that source contains the whole genome to get the barcode occurrences
pam_barcode_occurrences = targets_with_pam[
targets_with_pam.Type == "source"
].Barcode.value_counts()

gene_targets_with_pam = targets_with_pam[targets_with_pam.Type == "gene"]

logger.info(f"Taken {len(sam.ranges)} barcodes from aligner ...")
logger.info(
f"Found {len(targets[targets.Type == 'source'])} barcodes in the genome ..."
)
logger.info(
f"Found {len(pam_barcode_occurrences)} targets with matching PAM: '{pam}'"
)

gene_targets_with_pam_in_multiple_sites = gene_targets_with_pam[
gene_targets_with_pam.Barcode.isin(barcodes_in_multiple_sites)
]

logger.info(
f"Found {len(gene_targets_with_pam_in_multiple_sites)} gene targets with matching PAM in multiple sites ..."
)

# return the genes with pams that are NOT in multiple sites
gene_targets_with_pam_not_in_multiple_sites = gene_targets_with_pam[
~gene_targets_with_pam.Barcode.isin(barcodes_in_multiple_sites)
]

logger.info(
f"Found {len(gene_targets_with_pam_not_in_multiple_sites)} gene targets with matching PAM NOT in multiple sites ..."
)
logger.info(gene_targets_with_pam_not_in_multiple_sites.df)

# return sites with PAMS that do NOT have genes associated with
sites_with_pam_no_genes = targets_with_pam[
~targets_with_pam.Barcode.isin(gene_targets_with_pam.Barcode)
]
logger.info(
f"Found {len(sites_with_pam_no_genes)} sites with PAMs that are not located in a gene ..."
)
if len(sites_with_pam_no_genes) > 0:
logger.info(sites_with_pam_no_genes.df)

# return the sites without a gene with PAMs that are NOT in multiple sites
sites_with_pam_no_genes_not_in_multiple_sites = sites_with_pam_no_genes[
~sites_with_pam_no_genes.Barcode.isin(barcodes_in_multiple_sites)
]
logger.info(
f"Found {len(sites_with_pam_no_genes_not_in_multiple_sites)} sites with PAMs that are not located in a gene and NOT in multiple sites ..."
)
if len(sites_with_pam_no_genes_not_in_multiple_sites) > 0:
logger.info(sites_with_pam_no_genes_not_in_multiple_sites.df)
feature_unambiguous_targets = feature_unique_targets[
~feature_unique_targets.duplicated(subset=["Barcode"]).reset_index(drop=True)
]

0 comments on commit 126c279

Please sign in to comment.