makeBarcodeFusionMatrix_ATAC.sh

#!/bin/bash

### The following code extracts reads from Arriba output "fusions.tsv" and determines which cell they are associated with
### The product is a matrix of barcodes by fusion expression

# Set sample variables
sample="L215_tumour_E"

# Extract desired reads
echo "Extracting reads"
grep "NC_004107" fusions.tsv | grep "WCK01" > WHV_WCK_fusions.tsv
#grep "NC_004107" fusions.discarded.tsv | grep "WCK01" > WHV_WCK_fusions.discarded.tsv
head -n 1 fusions.tsv > header.tsv
cat header.tsv WHV_WCK_fusions.tsv > filtered_fusions.tsv

# Extract read identifiers associated with any fusion, making into newlines and unique
cut -f30 filtered_fusions.tsv | tr , "\n" | tail -n +2 | sort | uniq > fusion_reads.txt

# Extract cell barcodes and UMIs from unique list
echo "Identifying cells with viral integration events"
seqkit grep \
--pattern-file fusion_reads.txt \
/data/zoe_fastqs/multiome/ATAC/"$sample"/*R2* \
> R2_fusions.fastq

# Convert barcode fastq file to fasta, then remove fastq files
echo "Converting fastq files"
seqkit fq2fa R2_fusions.fastq -o R2_fusions.fasta
rm R2_fusions.fastq

# Trim to only have cell barcode in fasta file (reverse compliment of last 16 bp for ATAC)
fastx_trimmer -f 9 -l 24 -i R2_fusions.fasta -o R2_fusions_cellBarcode.fasta

# Take reverse complement
seqkit seq R2_fusions_cellBarcode.fasta -r -p -t "dna" -o R2_fusions_cellBarcode_revComp.fasta

# Trim to only have read name in fasta header
cut -d ' ' -f1 R2_fusions_cellBarcode_revComp.fasta | sponge R2_fusions_cellBarcode_revComp.fasta

# Convert fasta to table
seqkit fx2tab R2_fusions_cellBarcode_revComp.fasta > R2_fusions_cellBarcode.tsv

# Isolate reads identifiers from WHV_WCK_fusions.tsv and replace them with corresponding cell barcode
cut -f30 WHV_WCK_fusions.tsv > fusion_barcodes.tsv
while IFS=$'\t' read -r header barcode
do
    echo "Replacing" $header "with" ${barcode}-1
    sed -i "s/$header/${barcode}-1/g" fusion_barcodes.tsv
done < R2_fusions_cellBarcode.tsv
# Add "barcodes" header to this file
sed -i '1s/^/barcodes\n/g' fusion_barcodes.tsv

# Add this to filtered_fusions.tsv
paste -d "\t" filtered_fusions.tsv fusion_barcodes.tsv > filtered_fusions_withBarcodes.tsv

# Extract barcodes and associated genes from filtered_fusions_withBarcodes.tsv
cut -f1,2,31 filtered_fusions_withBarcodes.tsv > all_fusion_barcodes.tsv
# Combine both gene columns
awk '{print $1 "," $2 "\t" $3}' all_fusion_barcodes.tsv | sponge all_fusion_barcodes.tsv

# Remove super long parts of gene names
sed -i 's/mikado\.WCK01_AAH20201022_F8-//g' all_fusion_barcodes.tsv
# Remove WhBvgp
sed -e 's/WhBvgp//g' -e 's/,\t/\t/g' -e 's/^,//g' -e 's/,,/,/g' -i all_fusion_barcodes.tsv
# Remove lines that contain periods (as these are empty gene names at this point)
grep -v "\." all_fusion_barcodes.tsv | sponge all_fusion_barcodes.tsv
# Remove brackets and anything within them, removing whitespace
sed -e 's/([^()]*)//g' all_fusion_barcodes.tsv | tr -d ' ' | sponge all_fusion_barcodes.tsv

# Expand the comma separated barcodes, while repeated gene fusion information
sed -Ee 's/^((\S+\t)[^,]+),/\1\n\2/;P;D' -i all_fusion_barcodes.tsv
# Remove header
tail -n +2 all_fusion_barcodes.tsv | sponge all_fusion_barcodes.tsv
# NOTE - could make a contingency table in R at this point, but I'm going to make one in bash
# Count occurances of each fusion
sort all_fusion_barcodes.tsv | uniq -c | sponge all_fusion_barcodes.tsv
# Make contingency table, using 0s instead of NAs for empty values
datamash -sW crosstab 3,2 unique 1 --filler=0 < all_fusion_barcodes.tsv > fusion_countMatrix.tsv