SplicingRates_code.Rmd

---
title: "Code for: Drosophila Splicing Rates"
author: "Athma A. Pai"
output: pdf_document
---

# Contents
- [Set up of workspace](#setup)
- [Functions](#functions)
- [Expression](#expression)

[Table of Contents](#contents)

# Setup
## Set-up workspace and libraries
```{r, message=FALSE, warning=FALSE}
# Project root. Some later chunks use relative paths (e.g. "junccombo/...",
# "threeprime_distance.txt"), so they resolve against this working directory.
setwd("~/Dropbox (MIT)/Projects/Adelman/timecourse")
# Same directory with a trailing slash; used as a prefix in paste0() calls.
setdir="~/Dropbox (MIT)/Projects/Adelman/timecourse/"
#load libraries
library(ggplot2)
library(wesanderson)
library(GGally)
library(corrplot)
library(cowplot)
library(scales)
library(gridExtra)
library(ggrepel)
library(MatchIt)
library(seqLogo)
library(png)
library(grid)
library(RColorBrewer)
library(relaimpo)

# Restore the cached workspace, then write the current workspace back to the
# same .Rdata file.
# NOTE(review): both paths go through "~/Desktop/Dropbox (MIT)/...", whereas
# setwd()/setdir in this chunk use "~/Dropbox (MIT)/..." -- confirm which
# location is the real one.
# NOTE(review): run in sequence, save.image() re-saves what load() just read
# (plus anything already in the session); presumably these two lines are meant
# to be run selectively -- confirm.
load("~/Desktop/Dropbox (MIT)/Projects/Adelman/timecourse/FinalCode.Rdata")
save.image("~/Desktop/Dropbox (MIT)/Projects/Adelman/timecourse/FinalCode.Rdata")

```

[Table of Contents](#contents)

# Functions
## Functions to use during script
```{r, echo=TRUE}
# Fit log(vals) ~ 0 + times, i.e. a linear model forced through the origin.
#
# vals:  numeric vector with one value per entry of `times`.
# times: time of each value; defaults to three replicates each at 5, 10 and 20
#        (the design that was previously hard-coded).
# Returns c(slope, p-value of the slope, adjusted R^2), or c(NA, NA, NA) when
# the model cannot be fit (e.g. a zero value, whose log is -Inf).
getLM_throughzero <- function(vals, times = c(5,5,5,10,10,10,20,20,20)){
  # A single tryCatch keeps the fit local: with two separate try() calls a
  # failed lm() left `lm.hold` undefined here, and the second step could then
  # silently pick up a stale `lm.hold` from an enclosing environment.
  lm.row <- tryCatch({
    lm.hold <- lm(log(vals) ~ 0 + times)
    lm.summary <- summary(lm.hold)
    c(as.numeric(lm.hold$coefficients[1]),
      lm.summary$coefficients[1, 4],
      lm.summary$adj.r.squared)
  }, error = function(e) c(NA, NA, NA))
  return(lm.row)
}

# Through-origin fit of log(values) against a distance-corrected time.
#
# input:    vector of length 10; elements 1-9 are the values (three replicates
#           each at 5, 10 and 20), element 10 is a distance in nt.
# txn_rate: assumed transcription rate in nt/minute (default 1500, i.e.
#           1.5 kb/minute as in the original code).
# Returns c(slope, p-value of the slope, adjusted R^2), or c(NA, NA, NA) when
# the model cannot be fit.
getLM_throughzero_tau <- function(input, txn_rate = 1500){
  # coerce like getLM_throughzero_tau_lowadj does, so rows that arrive as
  # character (e.g. from apply() over a mixed table) are handled the same way
  vals <- as.numeric(input[1:9])
  distance <- as.numeric(input[10])
  # time needed to transcribe `distance` at the assumed rate
  tau <- distance/txn_rate
  # get corrected time: (labelling time + tau) / 2
  times <- c(5,5,5,10,10,10,20,20,20)
  times_with <- (times+tau)/2
  # One tryCatch instead of two try() calls, so a failed lm() can never fall
  # through to a stale `lm.hold` from an enclosing environment.
  lm.row <- tryCatch({
    lm.hold <- lm(log(vals) ~ 0 + times_with)
    lm.summary <- summary(lm.hold)
    c(as.numeric(lm.hold$coefficients[1]),
      lm.summary$coefficients[1, 4],
      lm.summary$adj.r.squared)
  }, error = function(e) c(NA, NA, NA))
  return(lm.row)
}

# Same fit as getLM_throughzero_tau, with a fallback for very fast decay.
#
# input:    vector of length 10; elements 1-9 are the values (three replicates
#           each at 5, 10 and 20), element 10 is a distance in nt.
# txn_rate: assumed transcription rate in nt/minute (default 1500).
# Returns a length-4 vector:
#   c(slope, p-value, adjusted R^2, "fit")  when the linear model is used
#   c(-k, NA, NA, "corrected")              when the low-value correction applies
#   c(NA, NA, NA, NA)                       when neither is available
# Because of the label in position 4, the first two forms are character
# vectors; callers need as.numeric() on the numeric fields.
getLM_throughzero_tau_lowadj <- function(input, txn_rate = 1500){
  vals <- as.numeric(input[1:9])
  distance <- as.numeric(input[10])
  # time needed to transcribe `distance` at the assumed rate
  tau <- distance/txn_rate
  # get corrected time: (labelling time + tau) / 2
  times <- c(5,5,5,10,10,10,20,20,20)
  times_with <- (times+tau)/2
  # One tryCatch instead of two try() calls, so a failed lm() can never fall
  # through to a stale `lm.hold` from an enclosing environment.
  lm.row <- tryCatch({
    lm.hold <- lm(log(vals) ~ 0 + times_with)
    lm.summary <- summary(lm.hold)
    c(as.numeric(lm.hold$coefficients[1]),
      lm.summary$coefficients[1, 4],
      lm.summary$adj.r.squared, "fit")
  }, error = function(e) c(NA, NA, NA, NA))
  # Correction: derive a rate k from the mean of the first three values at the
  # first corrected time; if that rate predicts a value <= 0.1 at the second
  # corrected time, report -k instead of the fitted slope.
  meaninitial <- mean(vals[1:3], na.rm=TRUE)
  k <- -log(meaninitial)/times_with[1]
  meansecond <- exp(-k*times_with[4])
  # explicit NA check replaces try(if(NA) ...), which relied on the error
  if(!is.na(meansecond) && meansecond <= 0.1){
    lm.row <- c(-k, NA, NA, "corrected")
  }
  return(lm.row)
}


# Append one 3' splice-site and one 5' splice-site BED line per intron.
#
# name    = output basename; lines are appended to <name>.threeSS.bed and
#           <name>.fiveSS.bed
# introns = table with columns [gene, chr, start, end, strand]
# Windows written (relative to the intron start/end columns):
#   "+" strand (any strand other than "-"):
#       3'ss = [end-21, end+2]     5'ss = [start-3, start+6]
#   "-" strand:
#       3'ss = [start-3, start+20] 5'ss = [end-7, end+2]
# Returns a character matrix with columns "fivess" and "threess" holding the
# BED name fields ("<gene>;5ss", "<gene>;3ss"), one row per intron (zero rows
# for an empty table).
writeSSbeds <- function(name,introns){
  file.3ss <- paste0(name,".threeSS.bed")
  file.5ss <- paste0(name,".fiveSS.bed")
  # Coordinates are formatted explicitly: pasting a bare double into a
  # character vector turns e.g. 100000 into "1e+05", which is not a valid BED
  # coordinate.
  coord <- function(value, offset){
    format(as.numeric(as.character(value)) + offset, scientific = FALSE, trim = TRUE)
  }
  n.introns <- nrow(introns)
  # preallocated instead of growing with rbind(); also well-defined for 0 rows
  out.names <- matrix(NA_character_, nrow = n.introns, ncol = 2,
                      dimnames = list(NULL, c("fivess","threess")))
  for(i in seq_len(n.introns)){
    print(i)
    gene <- as.character(introns[i,1])
    chr <- as.character(introns[i,2])
    start <- introns[i,3]
    end <- introns[i,4]
    name3 <- paste(gene,"3ss",sep=";")
    name5 <- paste(gene,"5ss",sep=";")
    if(introns[i,5] == "-"){
      intron3 <- c(chr, coord(start,-3), coord(start,20), name3, ".", "-")
      intron5 <- c(chr, coord(end,-7), coord(end,2), name5, ".", "-")
    } else {
      intron3 <- c(chr, coord(end,-21), coord(end,2), name3, ".", "+")
      intron5 <- c(chr, coord(start,-3), coord(start,6), name5, ".", "+")
    }
    out.names[i, ] <- c(intron5[4], intron3[4])
    write.table(rbind(intron3),file=file.3ss,append=TRUE,sep="\t",quote=FALSE,row.names=FALSE,col.names=FALSE)
    write.table(rbind(intron5),file=file.5ss,append=TRUE,sep="\t",quote=FALSE,row.names=FALSE,col.names=FALSE)
  }
  return(out.names)
}

# Solve for the time at which the fitted line reaches log(total).
# x = [constant, intercept, total]; returns (intercept - log(total)) / (-constant)
Tcomplete <- function(x){
  log.gap <- x[2] - log(x[3])
  neg.constant <- -x[1]
  log.gap/neg.constant
}

# Convert a fitted constant (slope of log(value) over time) into a half-life.
# x = [constant]; only the first element is used. Returns log(2) / (-constant).
halflife <- function(x){
  neg.constant <- -x[1]
  log(2)/neg.constant
}

# Append the two flanking exons of one event to a pair of BED files.
flankingExonBeds <- function(x,filename){
  # x        = event name, ":"-separated; field 1 (first BED column), fields
  #            2-3 and 5-6 (two coordinate pairs) and field 7 (strand) are used
  # filename = output basename; appends to <filename>.flankexon.up.bed and
  #            <filename>.flankexon.down.bed
  label <- as.character(x)
  fields <- strsplit(label, split=":")[[1]]
  first.pair <- c(fields[2], fields[3])
  second.pair <- c(fields[5], fields[6])
  # on the "-" strand the second coordinate pair is the upstream exon
  if(fields[7] == "-"){
    up.coords <- second.pair
    down.coords <- first.pair
  } else {
    up.coords <- first.pair
    down.coords <- second.pair
  }
  upexon <- c(fields[1], up.coords, label, "up", fields[7])
  downexon <- c(fields[1], down.coords, label, "down", fields[7])
  write.table(matrix(upexon, nrow=1), paste0(filename,".flankexon.up.bed"),
              append=TRUE, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
  write.table(matrix(downexon, nrow=1), paste0(filename,".flankexon.down.bed"),
              append=TRUE, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
}

# Label each value of `vec` with the quantile bin it falls into.
#   vec = numeric vector to bin
#   x   = bin width in percent (e.g. 5 for 5% bins)
# Returns a character vector of the same length holding quantile labels such
# as "5%", "10%", ...; entries that match no cutoff (e.g. NA) stay "0%".
# To order the labels as factor levels: paste(seq(x,100,x),"%",sep="")
binVec <- function(vec,x){
  probs <- seq(100, x, by=-x)/100
  cutoffs <- quantile(vec, probs, na.rm=TRUE)
  labels <- names(cutoffs)
  binned <- rep("0%", length(vec))
  # cutoffs run from the 100% quantile downwards, so each pass overwrites the
  # label with a lower bin wherever the value is still at or below the cutoff
  for(k in seq_along(cutoffs)){
    at.or.below <- which(vec <= cutoffs[[k]])
    binned[at.or.below] <- labels[k]
  }
  binned
}

# Assign each value of `x` to a fixed-width bin and return the bin's lower edge.
#   x   = numeric vector
#   num = bin width (must be positive)
# Bin edges are seq(min(x), max(x) - num, by = num); values at or above the
# last edge all fall into the last bin. NA values stay NA. If the range of x
# is narrower than `num`, everything falls into a single bin starting at min(x).
getBPbins <- function(x, num){
  if(length(num) != 1 || is.na(num) || num <= 0){
    stop("'num' must be a single positive number", call. = FALSE)
  }
  if(length(x) == 0 || all(is.na(x))){
    return(rep(NA_real_, length(x)))
  }
  minx <- min(x, na.rm = TRUE)
  maxx <- max(x, na.rm = TRUE)
  # never let the upper limit drop below minx, which would make seq() fail
  bins <- seq(minx, max(maxx - num, minx), by = num)
  # findInterval gives i with bins[i] <= x < bins[i+1], and length(bins) for
  # x >= last edge: the same rule the original index loop implemented
  bins[findInterval(x, bins)]
}

# Running median of one column over neighbouring rows.
#   datahere = data frame, already in the order the window should run over; must
#              also provide the columns intronlen, fitvalue, intronnum, IEratio
#   bins     = half-width of the window (rows on each side)
#   bin.min  = minimum window size used at the two ends
#   colnum   = column (index or name) whose running median is computed
# Window per row i (n = number of rows), in order of precedence:
#   i >= n - bins : rows min(i, n - bin.min) .. n
#   i <= bins     : rows 1 .. max(bin.min, i)
#   otherwise     : rows (i - bins) .. (i + bins)
# (the original used three independent ifs, so the tail rule won when both
# end rules applied; that precedence is kept).
# Returns a data frame with columns length, half, pos, fit_half (the running
# median) and IEratio; zero rows in, zero rows out.
get_runningmedians <- function(datahere, bins, bin.min, colnum){
  n.rows <- nrow(datahere)
  # seq_len() instead of 1:nrow(): an empty table no longer iterates over c(1, 0)
  run.half <- vapply(seq_len(n.rows), function(i){
    if(i >= n.rows - bins){
      window <- min(i, n.rows - bin.min):n.rows
    } else if(i <= bins){
      window <- 1:max(bin.min, i)
    } else {
      window <- (i - bins):(i + bins)
    }
    as.numeric(median(datahere[window, colnum], na.rm = TRUE))
  }, numeric(1))
  median.data <- data.frame(length = datahere$intronlen, half = datahere$fitvalue,
                            pos = datahere$intronnum, fit_half = run.half,
                            IEratio = datahere$IEratio)
  return(median.data)
}

```


# Expression
Read in gene expression data. TPMs were obtained by running Kallisto on each fastq file and then combining transcript-level TPMs into gene-level TPMs with the tximport package.

```{r} 
# Gene-level abundance table (tab-separated, with header).
txi.path <- paste0(setdir, "adelman_abundance.gene")
adelman.txi <- read.table(file = txi.path, sep = "\t", header = TRUE)
# Reorder the file's columns 3, 1, 2, 4 and label them 5m, 10m, 20m, total.
adelman.txi <- setNames(adelman.txi[, c(3, 1, 2, 4)], c("5m", "10m", "20m", "total"))
```


# Splicing
Read in splicing information across timepoints. PSIs were obtained by running MISO on each mapped bam file (mapped with Tophat)

```{r}
# PSI table: column 1 holds the intron names, columns 2-12 the per-sample PSIs.
psi.path <- paste0(setdir, "4sU_all.psi")
all.combo <- read.table(psi.path, header = TRUE)
psis <- all.combo[, 2:12]
rownames(psis) <- all.combo[, 1]
# take mean across replicates per timepoint (columns 2-4, 5-7, 8-10, 11-12)
replicate.cols <- list(2:4, 5:7, 8:10, 11:12)
replicate.means <- lapply(replicate.cols, function(cols){
  rowMeans(all.combo[, cols], na.rm = TRUE)
})
miso.meanPsi <- as.data.frame(do.call(cbind, replicate.means))
rownames(miso.meanPsi) <- all.combo$introns
colnames(miso.meanPsi) <- c("meanPsi_5m", "meanPsi_10m", "meanPsi_20m", "meanPsi_total")
# split the ":"-separated intron names into a 7-column matrix
name.fields <- unlist(strsplit(rownames(miso.meanPsi), split = ":"))
miso.names <- matrix(name.fields, ncol = 7, byrow = TRUE)
```

Read in junction-based splicing information. PSIs were obtained using only junction reads - with intron-exon | exon-intron reads indicating incomplete splicing and exon-exon reads indicating completed splicing.
```{r}
altpsi.names <- c("5min_rep1", "5min_rep2", "5min_rep3","10min_rep1","10min_rep2","10min_rep3","20min_rep1","20min_rep2","20min_rep3","total_rep1","total_rep2")
# output column for each sample, in the same order as altpsi.names
altpsi.colnames <- c("psi_5m_r1","psi_5m_r2","psi_5m_r3","psi_10m_r1","psi_10m_r2","psi_10m_r3","psi_20m_r1","psi_20m_r2","psi_20m_r3","psi_total_r1","psi_total_r2")
# the first sample only supplies the intron names for column 1
altpsi.init <- read.table(paste0(setdir,"alt_psi/Adelman_4sU_RNA-seq_5min_rep1.intron.psi"),header=TRUE)
altpsi <- data.frame(name=altpsi.init$name)
for(i in seq_along(altpsi.names)){
  print(i)
  psi.file <- paste0(setdir,"alt_psi/Adelman_4sU_RNA-seq_",altpsi.names[i],".intron.psi")
  altpsi.hold <- read.table(psi.file, header=TRUE)
  # blank out PSIs supported by fewer than 20 reads (sum of columns 2-4)
  readcount <- rowSums(altpsi.hold[, 2:4])
  low.coverage <- which(readcount < 20)
  altpsi.hold$PSI[low.coverage] <- NA
  altpsi[[altpsi.colnames[i]]] <- altpsi.hold$PSI
}
```


# Simulations to assess methods

Read in simulated ratios
```{r}
Ddists <- seq(1,5,1)
exprlevels <- seq(1,50,5)
ratiofit.dir <- "~/Dropbox (MIT)/Projects/Adelman/timecourse/simulations/ratioSIM"

# read one table per (D, X) combination and stack them in loop order
ratiosims.parts <- list()
for(D in Ddists){
  for(X in exprlevels){
    print(paste0(D," - ",X))
    sim.file <- paste0(ratiofit.dir,"/ratioSIM_D",D,"_X",X,".txt")
    hold <- read.table(sim.file, header=TRUE)
    ratiosims.parts[[length(ratiosims.parts) + 1]] <- hold
  }
}
ratiosims <- do.call(rbind, ratiosims.parts)

# make NA those without any reads in either the 5 or 60m timepoints
no.intron.reads <- which(ratiosims$intron_5 == 0 | ratiosims$intron_60 == 0)
ratiosims$splicingratio[no.intron.reads] <- NA


```

Read in simulated junc fits
```{r}

Ddists <- seq(1,5,1)
exprlevels <- seq(1,50,5)
juncfit.dir <- "~/Dropbox (MIT)/Projects/Adelman/timecourse/simulations/juncSIM"

# read one table per (D, X) combination and stack them in loop order
juncsims.parts <- list()
for(D in Ddists){
  for(X in exprlevels){
    print(paste0(D," - ",X))
    sim.file <- paste0(juncfit.dir,"/juncSIM_D",D,"_X",X,".txt")
    hold <- read.table(sim.file, header=TRUE)
    juncsims.parts[[length(juncsims.parts) + 1]] <- hold
  }
}
juncsims <- do.call(rbind, juncsims.parts)

# make NA (fits that failed are already NA in the tables):
# any of the 5/10/20 ratios is infinite
infinite.ratio <- which(is.infinite(juncsims$ratio_5) | is.infinite(juncsims$ratio_10) | is.infinite(juncsims$ratio_20))
juncsims$root[infinite.ratio] <- NA
# no unspliced coverage at the 5m timepoint
no.unspliced.5 <- which(juncsims$unspliced_5 == 0)
juncsims$root[no.unspliced.5] <- NA
# no unspliced coverage at both the 10m and 20m timepoints
no.unspliced.later <- which(juncsims$unspliced_10 == 0 & juncsims$unspliced_20 == 0)
juncsims$root[no.unspliced.later] <- NA


```

Read in simulated miso fits
```{r}

Ddists = seq(1,5,1)
psifit.dir = "~/Dropbox (MIT)/Projects/Adelman/timecourse/simulations/psiSIM"

# read one table per D and stack them in loop order
psisims <- c()
for(D in Ddists){
  print(D)
  hold <- read.table(paste0(psifit.dir,"/psiSIM_D",D,".txt"),header=T)
  psisims <- rbind(psisims,hold)
}

# make NA:
# any timepoint is NA
psisims$halflife[which(is.na(psisims$psi5) | is.na(psisims$psi10) | is.na(psisims$psi20))] <- NA
# 10m | 20m PSI > 5m PSI
psisims$halflife[which((psisims$psi10 > psisims$psi5) | (psisims$psi20 > psisims$psi5))] <- NA
# all psis == 1
psisims$halflife[which(psisims$psi5 == 1 & psisims$psi10 == 1 & psisims$psi20 == 1)] <- NA
# 20m & 10m >= 5m
# (the first operand previously repeated the psi20 test, so the 10m value was
# never checked and every row with psi20 == psi5 was dropped regardless of 10m)
psisims$halflife[which((psisims$psi10 >= psisims$psi5) & (psisims$psi20 >= psisims$psi5))] <- NA


```

Combine simulated metrics
```{r}

Ddists = seq(1,5,1)
exprlevels = seq(1,50,5)
half_lives = unique(ratiosims$half_life)
introns = unique(ratiosims$intron)

# the three estimation methods, in the order used for every per-type vector
sim.types <- c("ratio","psi","junc")

# Spearman correlation between simulated half-life and the method's estimate.
sim_spearman <- function(d){
  cor(d$half_life, d$sim_hl, use="na.or.complete", method="spearman")
}

# Mean of (estimate - simulated half-life)^power over the rows where this
# method produced an estimate. The denominator is the method's OWN count of
# non-NA estimates; previously psi and junc were divided by the ratio count.
sim_mean_error <- function(d, power = 1){
  sum((d$sim_hl - d$half_life)^power, na.rm=TRUE)/sum(!is.na(d$sim_hl))
}

# collect per-combination pieces in lists and bind once at the end instead of
# growing two data frames with rbind() on every iteration
fullsims.parts <- list()
fullsims.cor.parts <- list()
for(D in Ddists){
  # ratio/junc tables store D_dist * 1000, the psi table stores D_dist as is
  d.ratio = subset(ratiosims, D_dist == D*1000)
  d.junc = subset(juncsims, D_dist == D*1000)
  d.psi = subset(psisims, D_dist == D)
  for(exp in exprlevels){
    d.e.ratio = subset(d.ratio, expression_level == exp)
    d.e.junc = subset(d.junc, expression_level == exp)
    d.e.psi = subset(d.psi, expression == exp)
    for(i in introns){
      print(paste(D,exp,i,sep=" - "))
      d.e.i.ratio = subset(d.e.ratio, intron == i)
      d.e.i.junc = subset(d.e.junc, intron == i)
      # the psi table stores the intron length divided by 1000
      d.e.i.psi = subset(d.e.psi, intron_len == as.numeric(i)/1000)
      # get match inds
      d.e.i.h.ratio.match = match(half_lives, d.e.i.ratio$half_life)
      d.e.i.h.junc.match = match(half_lives, d.e.i.junc$half_life)
      d.e.i.h.psi.match = match(half_lives, d.e.i.psi$sim_half_life)
      # create temp dataframe: one row per half-life and method
      hold.data <- data.frame(D_dist = D,
                              expression_level = exp,
                              intron = i,
                              half_life = rep(half_lives, 3),
                              type = rep(sim.types,each=length(half_lives)),
                              sim_hl = c(d.e.i.ratio$splicingratio[d.e.i.h.ratio.match],
                                         d.e.i.psi$halflife[d.e.i.h.psi.match],
                                         d.e.i.junc$root[d.e.i.h.junc.match]))
      # per-method views, overall and restricted to half-lives <= 20
      by.type <- lapply(sim.types, function(tp) hold.data[which(hold.data$type == tp), ])
      by.type.hl20 <- lapply(by.type, function(d) d[which(d$half_life <= 20), ])
      # NOTE(review): in cor_hl20 the "ME" entries use the plain error for
      # ratio but the squared error for psi and junc (kept as in the original);
      # confirm which is intended.
      hold.row <- data.frame(D_dist = D,
                             expression_level = exp,
                             intron = i,
                             type = rep(sim.types,2),
                             cor_type = rep(c("spearman","ME"),each=3),
                             relate = c(vapply(by.type, sim_spearman, numeric(1)),
                                        vapply(by.type, sim_mean_error, numeric(1))),
                             cor_hl20 = c(vapply(by.type.hl20, sim_spearman, numeric(1)),
                                          sim_mean_error(by.type.hl20[[1]], power = 1),
                                          sim_mean_error(by.type.hl20[[2]], power = 2),
                                          sim_mean_error(by.type.hl20[[3]], power = 2)))
      # add to full dataframe
      fullsims.parts[[length(fullsims.parts) + 1]] <- hold.data
      fullsims.cor.parts[[length(fullsims.cor.parts) + 1]] <- hold.row
    }
  }
}
fullsims <- do.call(rbind, fullsims.parts)
fullsims.cor <- do.call(rbind, fullsims.cor.parts)

fullsims.cor$relate <- as.numeric(fullsims.cor$relate)
fullsims.cor$cor_hl20 <- as.numeric(fullsims.cor$cor_hl20)

# fullsims stacks the three methods, so a third of its rows is the number of
# simulated conditions per method
sims.per.type <- nrow(fullsims)/3
has.estimate <- !is.na(fullsims$sim_hl)

# fraction of simulated conditions for which each method returned an estimate
sum(fullsims$type=="ratio" & has.estimate, na.rm=TRUE)/sims.per.type
sum(fullsims$type=="psi" & has.estimate, na.rm=TRUE)/sims.per.type
sum(fullsims$type=="junc" & has.estimate, na.rm=TRUE)/sims.per.type

# same counts restricted to expression_level > 5 (denominator is still all
# simulated conditions per method)
above.5 <- fullsims$expression_level > 5
sum(fullsims$type=="ratio" & has.estimate & above.5, na.rm=TRUE)/sims.per.type
sum(fullsims$type=="psi" & has.estimate & above.5, na.rm=TRUE)/sims.per.type
sum(fullsims$type=="junc" & has.estimate & above.5, na.rm=TRUE)/sims.per.type

```

Look at simulation correlations
```{r}

# estimate vs simulated half-life: all points, a log-log view of short
# half-lives at expression_level >= 10, and boxplots per D_dist x expression
ggplot(fullsims, aes(x=half_life, y=sim_hl)) + geom_point(alpha=0.05) + geom_abline() + facet_wrap(~type)
ggplot(subset(fullsims, half_life<=10 & expression_level >= 10), aes(x=half_life, y=sim_hl)) + geom_point() + geom_abline() + facet_wrap(~type) + scale_x_log10() + scale_y_log10()
ggplot(fullsims, aes(x=half_life,y=sim_hl,fill=factor(type))) + geom_boxplot(notch=T) + facet_grid(D_dist~expression_level)

# Mean and standard error of the per-combination Spearman correlations, per
# method, for all half-lives (relate) and for half-lives <= 20 (cor_hl20).
sim.type.levels <- c("ratio","psi","junc")
spearman.cors <- subset(fullsims.cor, cor_type=="spearman")
# fullsims.cor holds 6 rows (3 methods x 2 measures) per simulated combination
n.sim.sets <- nrow(fullsims.cor)/6
summarise_by_type <- function(column, stat){
  vapply(sim.type.levels, function(tp){
    stat(spearman.cors[[column]][which(spearman.cors$type == tp)], na.rm=TRUE)
  }, numeric(1), USE.NAMES=FALSE)
}
full.sim.cors.data <- data.frame(cor_type = "spearman",
                                 type = sim.type.levels,
                                 meancor = summarise_by_type("relate", mean),
                                 secor = summarise_by_type("relate", sd)/sqrt(n.sim.sets),
                                 meancor_hl20 = summarise_by_type("cor_hl20", mean),
                                 secor_hl20 = summarise_by_type("cor_hl20", sd)/sqrt(n.sim.sets))
full.sim.cors.data$type <- factor(full.sim.cors.data$type, levels=c("ratio","psi","junc"))

fullsims.cor$type <- factor(fullsims.cor$type, levels=c("ratio","psi","junc"))
ggplot(subset(fullsims.cor, cor_type=="ME"), aes(x=factor(cor_type),y=relate, fill=factor(type))) + geom_boxplot(notch=T)
# This plot previously filtered on cor_type=="SSE" and plotted `cor_hl10`;
# fullsims.cor has neither (its measures are "spearman"/"ME" and its columns
# relate/cor_hl20), so the plot could not be drawn. It now uses the error rows
# and the half-life <= 20 column that actually exist.
# NOTE(review): confirm this is the intended quantity; non-positive values are
# dropped by the log10 y axis.
ggplot(subset(fullsims.cor, expression_level>=5 & cor_type=="ME"), aes(x=factor(cor_type),y=cor_hl20, fill=factor(type))) + geom_boxplot(notch=T) + scale_y_log10()

```

Simulation errors
```{r}

# error of each estimate relative to the simulated half-life: ratio, log2
# ratio, and absolute log2 ratio
fullsims$percenterror <- fullsims$sim_hl/fullsims$half_life
fullsims$logpercenterror <- log2(fullsims$percenterror)
fullsims$abslogpercenterror <- abs(fullsims$logpercenterror)

ggplot(fullsims, aes(x=factor(type), y=abslogpercenterror)) + geom_boxplot()

# Mean and standard error of the absolute log2 error for one method,
# optionally restricted to simulated half-lives <= max_hl.
# The standard error is sd / sqrt(number of non-NA values); the previous code
# divided sd by the row count itself (no square root, NAs included).
abs_error_summary <- function(tp, max_hl = Inf){
  vals <- fullsims$abslogpercenterror[which(fullsims$type == tp & fullsims$half_life <= max_hl)]
  c(mean = mean(vals, na.rm=TRUE),
    se = sd(vals, na.rm=TRUE)/sqrt(sum(!is.na(vals))))
}
psi.all <- abs_error_summary("psi")
junc.all <- abs_error_summary("junc")
psi.hl20 <- abs_error_summary("psi", max_hl = 20)
junc.hl20 <- abs_error_summary("junc", max_hl = 20)

# The ratio method gets NA in every summary column.
# NOTE(review): abslogpererror_sd keeps its original column name but, like
# abslogpererror_hl20_se, holds a standard error -- confirm that is intended.
fullsims.error <- data.frame(type = c("ratio","psi","junc"),
                             abslogpererror_mean = c(NA, psi.all[["mean"]], junc.all[["mean"]]),
                             abslogpererror_sd = c(NA, psi.all[["se"]], junc.all[["se"]]),
                             abslogpererror_hl20_mean = c(NA, psi.hl20[["mean"]], junc.hl20[["mean"]]),
                             abslogpererror_hl20_se = c(NA, psi.hl20[["se"]], junc.hl20[["se"]]))
fullsims.error$type <- factor(fullsims.error$type, levels=c("ratio","psi","junc"))
                             

### 10,000 random comparisons to get relative percent estimation

# per-method views of fullsims, for all half-lives and for half-lives <= 20.
# The comparisons below index the three views with the same row numbers, so
# they rely on the views being row-aligned (same conditions in the same order).
fullsims.ratio <- subset(fullsims, type=="ratio")
fullsims.psi <- subset(fullsims, type=="psi")
fullsims.junc <- subset(fullsims, type=="junc")
fullsims.ratio.hl20 <- subset(fullsims, type=="ratio" & half_life<=20)
fullsims.psi.hl20 <- subset(fullsims, type=="psi" & half_life<=20)
fullsims.junc.hl20 <- subset(fullsims, type=="junc" & half_life<=20)

# Draw up to n.draws row indices without replacement. Uses the same random
# stream as sample(eligible, n.draws), but no longer errors when fewer than
# n.draws rows are eligible.
draw_sims <- function(eligible, n.draws = 10000){
  if(length(eligible) == 0){ return(integer(0)) }
  eligible[sample.int(length(eligible), min(n.draws, length(eligible)))]
}

# log2 of (estimated ratio between two simulations) / (true ratio between them)
relative_log_error <- function(d, first, second){
  log2((d$sim_hl[first]/d$sim_hl[second])/(d$half_life[first]/d$half_life[second]))
}

# Mean and standard error of the signed and absolute relative errors per method.
# Standard error = sd / sqrt(number of non-NA comparisons); the previous code
# divided sd by the number of draws itself.
summarise_relative_error <- function(ratio.err, psi.err, junc.err){
  errs <- list(ratio.err, psi.err, junc.err)
  mean_of <- function(f) vapply(errs, function(v) mean(f(v), na.rm=TRUE), numeric(1))
  se_of <- function(f) vapply(errs, function(v) sd(f(v), na.rm=TRUE)/sqrt(sum(!is.na(f(v)))), numeric(1))
  out <- data.frame(type=c("ratio", "psi","junc"),
                    logpererror_mean = mean_of(identity),
                    logpererror_se = se_of(identity),
                    abslogpererror_mean = mean_of(abs),
                    abslogpererror_se = se_of(abs))
  out$type <- factor(out$type, levels=c("ratio","psi","junc"))
  out
}

# only conditions where all three methods produced an estimate are compared
complete.all <- which(!is.na(fullsims.ratio$sim_hl) & !is.na(fullsims.psi$sim_hl) & !is.na(fullsims.junc$sim_hl))
choose.first <- draw_sims(complete.all)
choose.second <- draw_sims(complete.all)

complete.hl20 <- which(!is.na(fullsims.ratio.hl20$sim_hl) & !is.na(fullsims.psi.hl20$sim_hl) & !is.na(fullsims.junc.hl20$sim_hl))
choose.first.hl20 <- draw_sims(complete.hl20)
choose.second.hl20 <- draw_sims(complete.hl20)

fullsims.ratio.logpercenterror <- relative_log_error(fullsims.ratio, choose.first, choose.second)
fullsims.psi.logpercenterror <- relative_log_error(fullsims.psi, choose.first, choose.second)
fullsims.junc.logpercenterror <- relative_log_error(fullsims.junc, choose.first, choose.second)

fullsims.ratio.logpercenterror.hl20 <- relative_log_error(fullsims.ratio.hl20, choose.first.hl20, choose.second.hl20)
fullsims.psi.logpercenterror.hl20 <- relative_log_error(fullsims.psi.hl20, choose.first.hl20, choose.second.hl20)
fullsims.junc.logpercenterror.hl20 <- relative_log_error(fullsims.junc.hl20, choose.first.hl20, choose.second.hl20)

fullsims.relative.error <- summarise_relative_error(fullsims.ratio.logpercenterror,
                                                    fullsims.psi.logpercenterror,
                                                    fullsims.junc.logpercenterror)

fullsims.relative.error.hl20 <- summarise_relative_error(fullsims.ratio.logpercenterror.hl20,
                                                         fullsims.psi.logpercenterror.hl20,
                                                         fullsims.junc.logpercenterror.hl20)


```

Correlation across half-lives
```{r}

fullsims$type <- factor(fullsims$type, levels=c("ratio","psi","junc"))

# one fill (points) and one line colour (fits) per method, in level order
point.fills <- c(brewer.pal(9,"BuPu")[6], brewer.pal(9,"Oranges")[3], brewer.pal(9,"RdPu")[3])
line.colors <- c(brewer.pal(9,"BuPu")[8], brewer.pal(9,"Oranges")[5], brewer.pal(9,"RdPu")[5])
points.ratio <- subset(fullsims, type=="ratio")
points.psi <- subset(fullsims, type=="psi")
points.junc <- subset(fullsims, type=="junc")

# estimates vs simulated half-life: faint points and a linear fit per method,
# with the identity line for reference
ggplot(fullsims, aes(x=half_life, y=sim_hl, fill=factor(type), color=factor(type))) +
  geom_point(data=points.ratio, alpha=0.05, shape=21, color=NA) +
  geom_point(data=points.psi, alpha=0.05, shape=21, color=NA) +
  geom_point(data=points.junc, alpha=0.05, shape=21, color=NA) +
  stat_smooth(data=points.ratio, method="lm") +
  stat_smooth(data=points.psi, method="lm") +
  stat_smooth(data=points.junc, method="lm") +
  scale_fill_manual(values=point.fills) +
  scale_color_manual(values=line.colors) +
  geom_abline(linetype="longdash", color="grey50", size=2) +
  xlim(0,10) +
  ylim(0,40)


```

# Model splicing
Read in only junction reads (exon-exon and intron-exon) to model splicing rates jointly with continuing transcription

```{r}
ratio.names <- c("5min_rep1", "5min_rep2", "5min_rep3","10min_rep1","10min_rep2","10min_rep3","20min_rep1","20min_rep2","20min_rep3","total_rep1","total_rep2")
# read the junction counts of every sample (paths are relative to the working
# directory) and stack them in sample order
juncratio.parts <- list()
for(i in ratio.names){
  print(i)
  combo.file <- paste0("junccombo/Adelman_4sU_RNA-seq_",i,"_startsites_junc.combo")
  junccombo <- read.table(combo.file, header=TRUE)
  # sample names look like "<time>_<rep>"
  sample.fields <- strsplit(i, split="_")[[1]]
  hold.data <- data.frame(time = sample.fields[1],
                          rep = sample.fields[2],
                          intron = junccombo$intron,
                          ee_count = junccombo$ee_count,
                          ie_count = junccombo$ie_count)
  juncratio.parts[[length(juncratio.parts) + 1]] <- hold.data
}
juncratio.data <- do.call(rbind, juncratio.parts)

# intron-exon over exon-exon junction counts, both divided by 40
ie.scaled <- juncratio.data$ie_count/40
ee.scaled <- juncratio.data$ee_count/40
juncratio.data$ratio <- ie.scaled/ee.scaled

ggplot(juncratio.data, aes(x=factor(time), y=ratio, fill=factor(rep))) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10(limits=c(0.05,7.5), breaks=c(0.1, 0.25, 0.5, 0.75, 1, 2.5, 5), labels=comma)

```

Get information for introns 
```{r}
# every intron that appears in the junction-count table
fullintronlist <- as.character(unique(juncratio.data$intron))
# gene: map each intron to a gene via the attribute column (V9) of the gff
dm3_introns <- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/dm3.genes.gff",sep="\t",header=F)
# second ";"-separated attribute field
gene1 <- unlist(lapply(strsplit(as.character(dm3_introns$V9),split=";"),"[",2))
# within it, the ","-separated entry containing "Gene:", with that prefix removed
# NOTE(review): unlist() only stays aligned with the gff rows if every row has
# exactly one "Gene:" entry -- confirm for this annotation file
gene2 <- sub("Gene:","",unlist(lapply(strsplit(gene1, split=","),function(x){return(x[grep("Gene:",x)])})))
dm3_introns$gene <- gene2
# first attribute field with the "ID=" prefix removed; matched against intron names below
name <- sub("ID=","",unlist(lapply(strsplit(as.character(dm3_introns$V9),split=";"),"[",1)))
dm3_introns$name <- name
full.intron.genes <- dm3_introns$gene[match(fullintronlist, dm3_introns$name)]

# TPM: column 4 of adelman.txi (named "total" where that table is built),
# looked up by gene through the table's row names
full.intron.TPM <- adelman.txi[match(full.intron.genes, rownames(adelman.txi)),4]

# intron length: field 5 minus field 3 of the ":"-separated intron name
full.names <- matrix(unlist(strsplit(fullintronlist,split=":")),ncol=7,byrow=T)
full.intron.length <- as.numeric(full.names[,5])-as.numeric(full.names[,3])

# three distance: gene table used for the distance to the gene end
dm3.gff <- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/dmel-all-r5.57-genes_parsed.gff")
dm3.gff$V9 <- sub("ID=","",dm3.gff$V9)
dm3.gff$genelength <- dm3.gff$V5 - dm3.gff$V4
# TES = V5, except V4 for genes on the "-" strand
dm3.gff$TES <- dm3.gff$V5
dm3.gff$TES[which(dm3.gff$V7 == "-")] <- dm3.gff$V4[which(dm3.gff$V7=="-")]

# Pick the strand-appropriate coordinate out of a ":"-separated intron name:
# field 3 when the strand (field 7) is "-", field 5 otherwise.
getpolyAdist <- function(x){
  fields <- strsplit(as.character(x), split=":")[[1]]
  plus.site <- as.numeric(fields[5])
  if(fields[7] == "-"){
    return(as.numeric(fields[3]))
  }
  plus.site
}

# strand-appropriate coordinate of each intron (see getpolyAdist), then its
# absolute distance to the TES of the intron's gene
full.intron.TES <- sapply(fullintronlist, getpolyAdist)
full.intron.threelength <- abs(dm3.gff$TES[match(full.intron.genes, dm3.gff$V9)] - full.intron.TES)

# written to the working directory
three.data <- data.frame(intron = fullintronlist, threelength = full.intron.threelength)
write.table(three.data, file="threeprime_distance.txt",sep="\t",quote=F,row.names=F,col.names=T)

# total MISO PSI: mean PSI of the total samples, matched by intron name
full.intron.PSI <- miso.meanPsi$meanPsi_total[match(fullintronlist, rownames(miso.meanPsi))]

# intron num: parsed from the BED name column (V4), which is ";"-separated;
# field 1 is the intron name, field 3 has the form "<key>=<number>"
intron.bed<- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/introns.sense.nochr.bed")
intron.names <- unlist(lapply(strsplit(as.character(intron.bed$V4),split=";"),"[",1))
intron.nums <- as.numeric(unlist(lapply(strsplit(unlist(lapply(strsplit(as.character(intron.bed$V4),split=";"),"[",3)),split="="),"[",2)))
full.intron.num <- intron.nums[match(fullintronlist, intron.names)]

# splice sites: column 4 of the 3'ss and 5'ss tables, matched by intron name
# (first ";"-separated part of V1 with the ">" removed)
# NOTE(review): column 4 is presumably the MaxEnt score -- confirm against the file
introns.3ss <- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/introns.sense.nochr.threeSS.maxEnt.tab.out",fill=T)
names.3ss <- unlist(lapply(strsplit(as.character(introns.3ss$V1),split=";"),"[",1))
names.3ss <- sub(">","",names.3ss)
full.intron.3ss <- introns.3ss[match(fullintronlist,names.3ss),4] 

introns.5ss <- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/introns.sense.nochr.fiveSS.maxEnt.tab.out",fill=T)
names.5ss <- unlist(lapply(strsplit(as.character(introns.5ss$V1),split=";"),"[",1))
names.5ss <- sub(">","",names.5ss)
full.intron.5ss <- introns.5ss[match(fullintronlist,names.5ss),4] 

# intron type: retained-intron (RI) events; keep only the "gene" rows and the
# strand (V7) and attribute (V9) columns, then take the ID from the attributes
RI.gff <- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/MISO_annots/modENCODE_RI_MISO.gff3")
RI.gff <- RI.gff[RI.gff$V3 == "gene",c(7,9)]
RI.names <- sub("ID=","",unlist(lapply(strsplit(as.character(RI.gff$V9),split=";"),"[",1)))
# Swap the two "@"-separated halves of an event name on the "-" strand.
# x = [name, strand]; names on any other strand are returned unchanged.
flipRInames <- function(x){
  event <- as.character(x[1])
  if(as.character(x[2]) != "-"){
    return(event)
  }
  halves <- strsplit(event, split="@")[[1]]
  paste(halves[2], halves[1], sep="@")
}
# retained-intron names with the halves swapped on the "-" strand
RI.name.strand <- cbind(RI.names, as.character(RI.gff[,1]))
RI.names.flip <- apply(RI.name.strand, 1, flipRInames)

# skipped-exon (SE) events: keep only the "gene" rows and the strand (V7) and
# attribute (V9) columns, then take the ID from the attributes
SE.gff <- read.table("~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/MISO_annots/modENCODE_SE_MISO.gff3")
SE.gene.rows <- SE.gff$V3 == "gene"
SE.gff <- SE.gff[SE.gene.rows, c(7,9)]
SE.first.attr <- unlist(lapply(strsplit(as.character(SE.gff$V9), split=";"), "[", 1))
SE.names <- sub("ID=", "", SE.first.attr)
# Build the names of the two introns flanking a skipped exon.
# x = [name, strand]; name has three "@"-separated parts.
# Returns c(part1@part2, part2@part3), or, on the "-" strand,
# c(part3@part2, part2@part1).
flankingSEnames <- function(x){
  name = strsplit(as.character(x[1]),split="@")[[1]]
  # compare the strand as a string, as flipRInames does; the previous
  # as.character(x[2] == "-") handed if() the string "TRUE"/"FALSE" and only
  # worked through implicit string-to-logical coercion
  if(as.character(x[2]) == "-"){
    flanks = c(paste(name[3],name[2],sep="@"),paste(name[2],name[1],sep="@"))
  } else {
    flanks = c(paste(name[1],name[2],sep="@"),paste(name[2],name[3],sep="@"))
  }
  return(flanks)
}
# One row per skipped-exon event, two columns (first and second flanking
# intron); flatten column by column so that position k and position k + n both
# refer to event k, which is why the event names are simply repeated twice.
SE.flank.matrix <- t(apply(cbind(SE.names, as.character(SE.gff[, 1])), 1, flankingSEnames))
SE.names.flanks <- c(SE.flank.matrix[, 1], SE.flank.matrix[, 2])
SE.names.combo <- rep(SE.names, times = 2)

containingSEnames <- function(x){
  # Build the name of the intron that spans a skipped-exon event, i.e. the
  # junction between the first and third '@'-separated parts of the event name.
  #   x: character vector c(name, strand)
  # Returns part1@part3 on the plus strand and part3@part1 on the minus strand.
  parts <- strsplit(as.character(x[1]), split="@")[[1]]
  # Compare the strand string itself. The previous form,
  # as.character(x[2] == "-"), only worked because if() coerces the strings
  # "TRUE"/"FALSE" back to logicals.
  if(as.character(x[2]) == "-"){
    return(paste(parts[3], parts[1], sep="@"))
  }
  paste(parts[1], parts[3], sep="@")
}
SE.names.containing <- apply(cbind(SE.names, as.character(SE.gff[, 1])), 1, containingSEnames)

# Classify every intron. Later assignments overwrite earlier ones, so the
# precedence is SEcontaining > SEflanking > RI > CI (the default).
full.intron.type <- rep("CI", length(fullintronlist))
full.intron.type[fullintronlist %in% RI.names.flip] <- "RI"
full.intron.type[fullintronlist %in% SE.names.flanks] <- "SEflanking"
full.intron.type[fullintronlist %in% SE.names.containing] <- "SEcontaining"

# Record which skipped-exon event each SE-associated intron belongs to
# (first matching event only, as match() returns the first hit).
is.SEflanking <- full.intron.type == "SEflanking"
is.SEcontaining <- full.intron.type == "SEcontaining"
full.intron.SEname <- rep(NA, length(fullintronlist))
full.intron.SEname[is.SEflanking] <- SE.names.combo[match(fullintronlist[is.SEflanking], SE.names.flanks)]
full.intron.SEname[is.SEcontaining] <- SE.names[match(fullintronlist[is.SEcontaining], SE.names.containing)]

# Per-intron annotation table (11 columns) that the later tables are built on.
fullintronlist.data <- data.frame(intron = fullintronlist,
                                  gene = full.intron.genes,
                                  TPM_total = full.intron.TPM,
                                  PSI_total = full.intron.PSI,
                                  threelength = full.intron.threelength,
                                  intronlen = full.intron.length,
                                  intronnum = full.intron.num,
                                  ss3 = full.intron.3ss,
                                  ss5 = full.intron.5ss,
                                  type = full.intron.type,
                                  SEname = full.intron.SEname)

```

Restructure the data frame to (1) combine counts across replicates and (2) allow easy access to the same intron at different timepoints. Simultaneously, calculate the ratio of intron-exon to exon-exon junction reads and the other parameters necessary to model half-lives. The transcription rate is set at 1500 nt/min.
```{r}
txnrate <- 1500
mappingnt <- 40

# Build the replicate-combined junction table for one labeling period.
#   timepoint : value of juncratio.data$time to select ("5min", "total", ...)
#   minutes   : labeling time in minutes used for D_prime
#   reps      : replicate labels (values of juncratio.data$rep) to sum over
#   na.rm     : passed to rowSums(); TRUE treats an intron missing from a
#               replicate as contributing zero reads, FALSE propagates NA
#   txnrate   : transcription rate in nt/min
#   mappingnt : junction normalisation length (it cancels in the ratio; kept
#               so the arithmetic matches the original expression exactly)
# Returns a data frame with one row per entry of fullintronlist and columns
# intron, threelength, ee_count_combo, ie_count_combo, ratio, D_prime, R_prime.
# Reads juncratio.data, fullintronlist and full.intron.threelength from the
# workspace.
combine_timepoint <- function(timepoint, minutes, reps, na.rm, txnrate, mappingnt) {
  sum_over_reps <- function(count_col) {
    per_rep <- lapply(reps, function(one_rep) {
      # Filter each replicate once and reuse it for both the counts and the
      # intron lookup (the original evaluated every subset() twice).
      rep_rows <- juncratio.data[which(juncratio.data$time == timepoint &
                                         juncratio.data$rep == one_rep), ]
      rep_rows[[count_col]][match(fullintronlist, rep_rows$intron)]
    })
    rowSums(do.call(cbind, per_rep), na.rm = na.rm)
  }
  combined <- data.frame(intron = fullintronlist,
                         threelength = full.intron.threelength,
                         ee_count_combo = sum_over_reps("ee_count"),
                         ie_count_combo = sum_over_reps("ie_count"))
  combined$ratio <- (combined$ie_count_combo/mappingnt)/(combined$ee_count_combo/mappingnt)
  combined$D_prime <- (combined$threelength) + (minutes * txnrate)
  combined$R_prime <- (combined$D_prime*log(2))/(txnrate*((1/combined$ratio) + 1))
  combined
}

labeled.reps <- c("rep1", "rep2", "rep3")

# 5min
minute <- 5
min5.juncratio.data <- combine_timepoint("5min", minute, labeled.reps, na.rm = TRUE,
                                         txnrate = txnrate, mappingnt = mappingnt)

# 10min
minute <- 10
min10.juncratio.data <- combine_timepoint("10min", minute, labeled.reps, na.rm = TRUE,
                                          txnrate = txnrate, mappingnt = mappingnt)

# 20min
minute <- 20
min20.juncratio.data <- combine_timepoint("20min", minute, labeled.reps, na.rm = TRUE,
                                          txnrate = txnrate, mappingnt = mappingnt)

# total
# Only two replicates, and (as in the original code) NAs are NOT removed here:
# an intron missing from either total replicate gets NA combined counts.
minute <- 12*60
total.juncratio.data <- combine_timepoint("total", minute, c("rep1", "rep2"), na.rm = FALSE,
                                          txnrate = txnrate, mappingnt = mappingnt)

# combine
# Column order matters: later code addresses Dprime_*/Rprime_* as columns 24:29.
combo.juncratio.data <- data.frame(fullintronlist.data,
                                   ee_count_5 = min5.juncratio.data$ee_count_combo,
                                   ee_count_10 = min10.juncratio.data$ee_count_combo,
                                   ee_count_20 = min20.juncratio.data$ee_count_combo,
                                   ee_count_total = total.juncratio.data$ee_count_combo,
                                   ie_count_5 = min5.juncratio.data$ie_count_combo,
                                   ie_count_10 = min10.juncratio.data$ie_count_combo,
                                   ie_count_20 = min20.juncratio.data$ie_count_combo,
                                   ie_count_total = total.juncratio.data$ie_count_combo,
                                   ratio_5 = min5.juncratio.data$ratio,
                                   ratio_10 = min10.juncratio.data$ratio,
                                   ratio_20 = min20.juncratio.data$ratio,
                                   ratio_total = total.juncratio.data$ratio,
                                   Dprime_5 = min5.juncratio.data$D_prime,
                                   Dprime_10 = min10.juncratio.data$D_prime,
                                   Dprime_20 = min20.juncratio.data$D_prime,
                                   Rprime_5 = min5.juncratio.data$R_prime,
                                   Rprime_10 = min10.juncratio.data$R_prime,
                                   Rprime_20 = min20.juncratio.data$R_prime)
```

Function to jointly model half-lives from all timepoints
```{r}
sumsqequationsolve <- function(atts, txnrate){
  # Jointly fit one half-life h to three labeling periods by minimising the
  # summed squared difference between h*(1 - 2^(-D_prime/(h*txnrate))) and
  # R_prime.
  #   atts[1:3] are Dprimes
  #   atts[4:6] are Rprimes
  #   txnrate is the transcription rate in nt/min
  # Returns c(fitted h, objective value at the optimum), or c(NA, NA) when all
  # three R_prime values are NA or when optim() fails (e.g. because some, but
  # not all, R_prime values are NA, which makes the objective NA at the start).
  print(atts)
  D_prime <- atts[1:3]
  R_prime <- atts[4:6]
  failed.row <- c(NA, NA)
  if(sum(is.na(R_prime))==3){ return(failed.row) }
  #   f <- function(h) { (h*(1 - 2^(-D_prime/(h*txnrate)))) - R_prime }
  # The three terms are kept explicit (rather than sum()) so that the
  # floating-point result is unchanged.
  f <- function(h){ ((h*(1 - 2^(-D_prime[1]/(h*txnrate)))) - R_prime[1])^2 + ((h*(1 - 2^(-D_prime[2]/(h*txnrate)))) - R_prime[2])^2 + ((h*(1 - 2^(-D_prime[3]/(h*txnrate)))) - R_prime[3])^2 }
  starth <- 0
  # optim() uses its default Nelder-Mead method here and warns that it is
  # unreliable in one dimension; the method is left as is so results do not
  # change.
  # tryCatch keeps the fit result local. The previous try() pair read
  # `fit.hold` after a failed fit, which silently picked up any stale
  # `fit.hold` from the global workspace instead of returning NA.
  tryCatch({
    fit <- optim(starth, f)
    c(fit$par, fit$value)
  }, error = function(e) {
    message("sumsqequationsolve: fit failed: ", conditionMessage(e))
    failed.row
  })
}
```

Run function across all introns in dataset and add results to dataframe
```{r} 
# Columns 24:29 of combo.juncratio.data, addressed by name: three D_prime
# values followed by three R_prime values, the order sumsqequationsolve expects.
fit.input.cols <- c("Dprime_5", "Dprime_10", "Dprime_20", "Rprime_5", "Rprime_10", "Rprime_20")
sumsqfit.data <- t(apply(combo.juncratio.data[, fit.input.cols], 1, sumsqequationsolve, 1500))
# First column is the fitted half-life, second the residual objective value.
combo.juncratio.data[["fitvalue"]] <- sumsqfit.data[, 1]
combo.juncratio.data[["yvalue"]] <- sumsqfit.data[, 2]

# Written without a header row.
write.table(combo.juncratio.data, file = "combo.juncratio.data.txt", sep = "\t",
            quote = FALSE, row.names = FALSE, col.names = FALSE)
```

Parse modeled introns to a set with optimal power to confidently detect half-lives
```{r}

# Build one logical mask from all filters; a row is kept only when every
# condition is TRUE (rows where any condition is NA are dropped, as with the
# chained subset() calls).
parse.keep <- with(combo.juncratio.data,
                   # fit succeeded
                   !is.na(fitvalue) &
                   # exon-exon junction reads present at every labeling period
                   !is.infinite(ratio_5) & !is.infinite(ratio_10) & !is.infinite(ratio_20) &
                   # intron-exon junction coverage in the 5m timepoint
                   ie_count_5 != 0 &
                   # coverage in at least two timepoints
                   (ie_count_10 > 0 | ie_count_20 > 0) &
                   # expressed (TPM >= 5)
                   TPM_total >= 5 &
                   # not retained in the total timepoint
                   PSI_total <= 0.2 &
                   # does not contain a skipped exon
                   type != "SEcontaining")
combo.juncratio.data.parsed <- combo.juncratio.data[which(parse.keep), ]

write.table(combo.juncratio.data.parsed, file = "combo.juncratio.data.parsed.txt", sep = "\t",
            quote = FALSE, row.names = FALSE, col.names = FALSE)

# Row positions of the retained introns within the full table.
parsed.introns <- match(combo.juncratio.data.parsed$intron, combo.juncratio.data$intron)

```

Fit model to each replicate individually for subset of 1000 most expressed genes
```{r}

# Add per-intron annotation to the replicate-level table (one lookup, reused).
annot.idx <- match(juncratio.data$intron, fullintronlist.data$intron)
juncratio.data$gene <- fullintronlist.data$gene[annot.idx]
juncratio.data$TPM <- fullintronlist.data$TPM_total[annot.idx]
juncratio.data$threelength <- fullintronlist.data$threelength[annot.idx]

# Labeling time in minutes: strip "min" from the time label; labels that do not
# parse as a number (i.e. "total", with a coercion warning) are set to 12 hours.
labeling.min <- as.numeric(sub("min", "", juncratio.data$time))
labeling.min[is.na(labeling.min)] <- 12*60
juncratio.data$min <- labeling.min

# Same D_prime / R_prime definitions as the replicate-combined tables, with the
# transcription rate fixed at 1500 nt/min.
juncratio.data$D_prime <- juncratio.data$threelength + (juncratio.data$min * 1500)
juncratio.data$R_prime <- (juncratio.data$D_prime*log(2))/(1500*((1/juncratio.data$ratio) + 1))

getroot <- function(atts, txnrate){
  # Solve h*(1 - 2^(-D_prime/(h*txnrate))) = R_prime for the half-life h using
  # a single labeling period.
  #   atts[1] is D_prime, atts[2] is R_prime
  #   txnrate is the transcription rate in nt/min
  # Returns c(root, function value at the root, estimated precision), or
  # c(NA, NA, NA) when R_prime is NA or uniroot() cannot find a root in
  # [0, 7500].
  d_prime <- as.numeric(atts[1])
  r_prime <- as.numeric(atts[2])
  no.root <- c(NA, NA, NA)
  if(is.na(r_prime)){
    return(no.root)
  }
  residual <- function(h) { (h*(1 - 2^(-d_prime/(h*txnrate)))) - r_prime }
  # Errors from uniroot (e.g. no sign change over the interval) are swallowed
  # on purpose and reported as an NA row.
  tryCatch({
    solution <- uniroot(residual, lower=0, upper=7500)
    c(solution$root, solution$f.root, solution$estim.prec)
  }, error = function(e) no.root)
}

# Fit a half-life to every (intron, time, replicate) row. The inputs are
# selected by name (getroot expects D_prime first, R_prime second) rather than
# by the positions 11 and 12.
juncroot.data <- t(apply(juncratio.data[, c("D_prime", "R_prime")], 1, getroot, 1500))
juncratio.data$root_value <- juncroot.data[,1]
write.table(juncratio.data, file="juncratio.data.txt", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)

# The file above is written WITH a header row, so it must be read back with
# header=TRUE. Reading it with header=FALSE turned the header into a data row
# and renamed the columns V1, V2, ..., which breaks every time==/rep== filter
# below. sep, quote and comment.char mirror how the file was written.
juncratio.data <- read.table(file="juncratio.data.txt", header=TRUE, sep="\t", quote="", comment.char="")
colnames(juncratio.data)

ggplot(juncratio.data, aes(x=factor(time),y=ratio,fill=factor(rep))) + geom_boxplot(notch=T) + scale_y_log10(limits=c(0.001,125),breaks=c(0.001,0.01,0.1, 1,10,100),labels=comma) +
  scale_fill_manual(values=c("dodgerblue3","dodgerblue2","dodgerblue1")) + labs(x="labeling period",y="ratio of ie/ee junction counts",fill="replicates") + theme(legend.position="bottom")

# One column of fitted half-lives per labeling period and replicate
# (fit_5r1 ... fit_20r3), aligned to fullintronlist.
rep.root.values <- function(timepoint, replicate) {
  rep.rows <- juncratio.data[which(juncratio.data$time == timepoint & juncratio.data$rep == replicate), ]
  rep.rows$root_value[match(fullintronlist, rep.rows$intron)]
}
fit.columns <- list()
for (period in c(5, 10, 20)) {
  for (replicate in 1:3) {
    fit.columns[[paste0("fit_", period, "r", replicate)]] <-
      rep.root.values(paste0(period, "min"), paste0("rep", replicate))
  }
}
juncratio.data.combo <- data.frame(fullintronlist.data, as.data.frame(fit.columns))


library(gplots)
# Replicate-by-replicate correlation of the fitted half-lives: all introns
# (Spearman), then only introns with TPM_total > 514 (Pearson).
fit.col.names <- names(fit.columns)
heatmap.2(cor(as.matrix(juncratio.data.combo[, fit.col.names]), use="complete.obs", method="spearman"))
heatmap.2(cor(as.matrix(subset(juncratio.data.combo, TPM_total > 514)[, fit.col.names]), use="complete.obs", method="pearson"))

withincors <- function(x){
  # Correlate every column of x with every later column of x.
  #   x: matrix or data frame; each column is a vector to be correlated
  # Returns the correlations in the order (1,2), (1,3), ..., (2,3), ...,
  # computed on complete observations. With fewer than two columns there are no
  # pairs and an empty result (NULL) is returned; previously a zero-column
  # input failed with a subscript error because of 1:ncol(x).
  n.col <- ncol(x)
  if(n.col < 2){ return(c()) }
  # Preallocate instead of growing the vector inside the loop.
  cor.vec <- numeric(n.col * (n.col - 1) / 2)
  k <- 0
  for(i in seq_len(n.col - 1)){
    for(j in (i + 1):n.col){
      k <- k + 1
      cor.vec[k] <- cor(x[,i], x[,j], use="complete.obs")
    }
  }
  return(cor.vec)
}
acrosstimecors <- function(x, y){
  # Correlate every column of x with every column of y.
  #   x, y: matrices or data frames with the same number of rows
  # Returns the correlations ordered with the columns of x as the outer loop
  # and the columns of y as the inner loop, computed on complete observations.
  # If either input has no columns, an empty numeric vector is returned;
  # previously this failed with a subscript error because of 1:ncol().
  n.x <- ncol(x)
  n.y <- ncol(y)
  # Preallocate instead of growing the vector inside the loop.
  cor.vec <- numeric(n.x * n.y)
  k <- 0
  for(i in seq_len(n.x)){
    for(j in seq_len(n.y)){
      k <- k + 1
      cor.vec[k] <- cor(x[,i], y[,j], use="complete.obs")
    }
  }
  return(cor.vec)
}
coefvar <- function(x){
  # Coefficient of variation: standard deviation divided by the mean.
  # NAs are not removed, so any NA in x gives NA.
  spread <- sd(x)
  centre <- mean(x)
  spread/centre
}

# Per-intron coefficient of variation of the fitted half-lives, within each
# labeling period, across pairs of periods, and across all nine fits.
fits.5 <- paste0("fit_5r", 1:3)
fits.10 <- paste0("fit_10r", 1:3)
fits.20 <- paste0("fit_20r", 1:3)
row.coefvar <- function(fit.cols){ apply(juncratio.data.combo[, fit.cols], 1, coefvar) }

withintime.5 <- row.coefvar(fits.5)
withintime.10 <- row.coefvar(fits.10)
withintime.20 <- row.coefvar(fits.20)

acrosstime.5.10 <- row.coefvar(c(fits.5, fits.10))
acrosstime.5.20 <- row.coefvar(c(fits.5, fits.20))
acrosstime.10.20 <- row.coefvar(c(fits.10, fits.20))

acrossall <- row.coefvar(c(fits.5, fits.10, fits.20))

# Long-format table: seven comparisons stacked, each spanning all introns.
comparison.levels <- c("withintime","acrosstime","acrossall")
period.levels <- c("5","10","20","5-10","5-20","10-20","all")
n.all <- length(withintime.5)
coef.data <- data.frame(type = rep(comparison.levels, times = c(n.all*3, n.all*3, length(acrossall))),
                        time = rep(period.levels, each = n.all),
                        coef = c(withintime.5, withintime.10, withintime.20, acrosstime.5.10, acrosstime.5.20, acrosstime.10.20, acrossall))
coef.data$type <- factor(coef.data$type, levels = comparison.levels)
coef.data$time <- factor(coef.data$time, levels = period.levels)

# Same table restricted to introns with TPM_total >= 514.
expinds <- which(juncratio.data.combo$TPM_total >= 514)
n.exp <- length(expinds)
coef.data.exp <- data.frame(type = rep(comparison.levels, times = c(n.exp*3, n.exp*3, n.exp)),
                            time = rep(period.levels, each = n.exp),
                            coef = c(withintime.5[expinds], withintime.10[expinds], withintime.20[expinds], acrosstime.5.10[expinds], acrosstime.5.20[expinds], acrosstime.10.20[expinds], acrossall[expinds]))
coef.data.exp$type <- factor(coef.data.exp$type, levels = comparison.levels)
coef.data.exp$time <- factor(coef.data.exp$time, levels = period.levels)

coef.fill.colors <- c(brewer.pal(11,"BrBG")[8:10], rev(brewer.pal(11,"BrBG")[2:4]), "darkgrey")
ggplot(coef.data.exp, aes(x=factor(type), y=coef, fill=factor(time))) +
  geom_boxplot(notch=T) +
  ylim(0,1) +
  scale_fill_manual(values=coef.fill.colors) +
  labs(x="comparison", y="coefficient of variation", fill="labeling periods") +
  theme(legend.position="bottom")


```

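Get confidence intervals for each intron (estimated here as the standard deviation of the fitted `root` values across the 10 subsampled datasets)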
```{r}

test1 <- read.table("subsampling/juncrates_sub1.txt", header = TRUE)
test2 <- read.table("subsampling/juncrates_sub2.txt", header = TRUE)
test3 <- read.table("subsampling/juncrates_sub3.txt", header = TRUE)

# For each of the 10 subsampled datasets, apply the same coverage filters as
# for the full data and pull out the fitted root for every parsed intron
# (NA where the intron did not pass the filters in that subsample).
subsample.roots <- vector("list", 10)
for(i in seq_len(10)){
  print(i)
  sample.hold <- read.table(paste0("subsampling/juncrates_sub", i, ".txt"), header = TRUE)
  sample.keep <- with(sample.hold,
                      !is.na(root) &
                      !is.infinite(ratio_5) & !is.infinite(ratio_10) & !is.infinite(ratio_20) &
                      ie_count_5 != 0 &
                      (ie_count_10 > 0 | ie_count_20 > 0))
  sample.hold.parsed <- sample.hold[which(sample.keep), ]
  match.inds <- match(combo.juncratio.data.parsed$intron, sample.hold.parsed$intron)
  subsample.roots[[i]] <- sample.hold.parsed$root[match.inds]
}
names(subsample.roots) <- paste("sample", seq(1:10), sep="")

# Columns: intron, sample1 ... sample10.
sampling.dist <- data.frame(intron = combo.juncratio.data.parsed$intron, subsample.roots)

# Spread of the subsample estimates per intron (standard deviation, ignoring
# subsamples in which the intron was filtered out).
sampling.dist$se <- apply(sampling.dist[, c(2:11)], 1, sd, na.rm = TRUE)
combo.juncratio.data.parsed$se <- sampling.dist$se


```

Get rates from different transcription rates
```{r}

# Start from the first 11 columns of the parsed table (the per-intron
# annotation columns inherited from fullintronlist.data).
sumsqfit.data.matrix <- combo.juncratio.data.parsed[,c(1:11)]

# Attach one half-life column per transcription-rate setting, matched by
# intron name against the row names of each sumsqfit.<rate> object.
# NOTE(review): the sumsqfit.500 ... sumsqfit.4000 objects are not created in
# this part of the file; presumably they hold fits run at 500-4000 nt/min and
# come from the loaded workspace - confirm.
sumsqfit.data.matrix$fitvalue_500 <- sumsqfit.500$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.500))]
sumsqfit.data.matrix$fitvalue_1000 <- sumsqfit.1000$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.1000))]
sumsqfit.data.matrix$fitvalue_1500 <- sumsqfit.1500$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.1500))]
sumsqfit.data.matrix$fitvalue_2000 <- sumsqfit.2000$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.2000))]
sumsqfit.data.matrix$fitvalue_2500 <- sumsqfit.2500$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.2500))]
sumsqfit.data.matrix$fitvalue_3000 <- sumsqfit.3000$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.3000))]
sumsqfit.data.matrix$fitvalue_3500 <- sumsqfit.3500$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.3500))]
sumsqfit.data.matrix$fitvalue_4000 <- sumsqfit.4000$halflife[match(sumsqfit.data.matrix$intron, rownames(sumsqfit.4000))]

# For every intron, pick the half-life from one randomly chosen
# transcription-rate column (columns 12:19 are the eight fitvalue_* columns
# added above). No seed is set in this chunk, so the draw differs between runs
# unless one is set elsewhere.
randomtxn <- c()
for(i in 1:nrow(sumsqfit.data.matrix)){
  print(i)
  randomtxn[i] <- as.numeric(sample(sumsqfit.data.matrix[i,c(12:19)],1))
}

# Store the randomly assigned half-life as the table's fitvalue.
sumsqfit.data.matrix$fitvalue <- randomtxn

```

Redo with 4sU incorporation lag times
```{r}


# use getroot function to get individual labeling period half-lives
# recalculate D_prime and R_prime with each labeling period

get_r2 <- function(min5, min10, min20, lagtime){
  # Refit per-period half-lives after shifting every labeling time by
  # `lagtime`, then compare the junction ratio implied by each fit with the
  # observed ratio.
  #   min5, min10, min20: data frames with columns intron, threelength,
  #     ee_count_combo, ie_count_combo, ratio, D_prime, R_prime (only
  #     threelength and ratio are read here)
  #   lagtime: offset in minutes added to the 5/10/20 minute labeling times
  # Returns, despite the function's name, the per-intron residual sum of
  # squares over the three labeling periods (not an R-squared).
  txnrate <- 1500
  newtime <- c(5, 10, 20) + lagtime

  # D_prime and R_prime recomputed with the shifted labeling time
  shifted_dprime <- function(period, minutes){ period$threelength + (minutes * txnrate) }
  shifted_rprime <- function(period, dprime){ (dprime*log(2))/(txnrate*((1/period$ratio) + 1)) }
  min5.Dprime <- shifted_dprime(min5, newtime[1])
  min5.Rprime <- shifted_rprime(min5, min5.Dprime)
  min10.Dprime <- shifted_dprime(min10, newtime[2])
  min10.Rprime <- shifted_rprime(min10, min10.Dprime)
  min20.Dprime <- shifted_dprime(min20, newtime[3])
  min20.Rprime <- shifted_rprime(min20, min20.Dprime)

  # half-life per labeling period (first element of each getroot result)
  fit_halflife <- function(dprime, rprime){
    t(apply(cbind(dprime, rprime), 1, getroot, txnrate))[,1]
  }
  print("CALCULATING 5 MIN...")
  min5.half <- fit_halflife(min5.Dprime, min5.Rprime)
  print("CALCULATING 10 MIN...")
  min10.half <- fit_halflife(min10.Dprime, min10.Rprime)
  print("CALCULATING 20 MIN...")
  min20.half <- fit_halflife(min20.Dprime, min20.Rprime)

  # ratio expected from a fitted half-life
  # NOTE(review): solving the R_prime definition used above for the ratio gives
  # 1/((x/(1 - exp(-x))) - 1) with x = DHratio, whereas the expression below
  # multiplies x by (1 - exp(-x)). Left unchanged here - confirm which form is
  # intended.
  getratio <- function(half, Dprime, txnrate){
    Hprime <- (half*txnrate)/log(2)
    DHratio <- Dprime/Hprime
    1/((DHratio*(1 - exp(-DHratio))) - 1)
  }
  min5.expR <- getratio(min5.half, min5.Dprime, txnrate)
  min10.expR <- getratio(min10.half, min10.Dprime, txnrate)
  min20.expR <- getratio(min20.half, min20.Dprime, txnrate)

  # residual sum of squares between observed and expected ratios
  SSres <- (min5$ratio - min5.expR)^2 + (min10$ratio - min10.expR)^2 + (min20$ratio - min20.expR)^2
  SSres
}

# Residual sum of squares (named rsq.* for historical reasons) for lag times
# from -1 to +1 minute in 0.2 minute steps, restricted to the parsed introns.
#rsq.minus3 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -3.0)[parsed.introns]
#rsq.minus2 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -2.0)[parsed.introns]
rsq.minus1 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -1.0)[parsed.introns]
rsq.minus08 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -0.8)[parsed.introns]
rsq.minus06 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -0.6)[parsed.introns]
rsq.minus04 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -0.4)[parsed.introns]
rsq.minus02 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, -0.2)[parsed.introns]
rsq.zero <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 0)[parsed.introns]
rsq.plus02 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 0.2)[parsed.introns]
rsq.plus04 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 0.4)[parsed.introns]
rsq.plus06 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 0.6)[parsed.introns]
rsq.plus08 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 0.8)[parsed.introns]
rsq.plus1 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 1)[parsed.introns]
#rsq.plus2 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 2)[parsed.introns]
#rsq.plus3 <- get_r2(min5.juncratio.data, min10.juncratio.data, min20.juncratio.data, 3)[parsed.introns]

# Long format: one block of length(parsed.introns) rows per lag time, in the
# same order as the offsets listed here.
rsq.data <- data.frame(offset = rep(c(-1,-0.8,-0.6,-0.4,-0.2,0,0.2,0.4,0.6,0.8,1), each=length(parsed.introns)),
                       rsq = c(rsq.minus1, rsq.minus08, rsq.minus06, rsq.minus04, rsq.minus02,
                               rsq.zero,rsq.plus02, rsq.plus04, rsq.plus06, rsq.plus08, rsq.plus1))

# Write the lag-time boxplot to a PDF. The plot is printed explicitly so it is
# drawn on the PDF device, and the device is closed with dev.off() (the
# original called the non-existent def.off(), which left the device open).
pdf("~/Desktop/RSS_lagtime.pdf")
print(ggplot(rsq.data, aes(y=rsq, x=factor(offset))) + geom_boxplot(notch=T) + ylim(0,10) +
  labs(x="lag time (minutes)",y="residual sum of squares"))
dev.off()

# Distribution of the residuals without any lag.
ggplot(data.frame(rsq = rsq.zero), aes(x=log10(rsq))) + geom_histogram(fill="black",color="white") + xlim(-25,2) +
  labs(x="log10(residual sum of squares)",y="count")

ggplot(data.frame(rsq = rsq.zero), aes(x=rsq)) + stat_ecdf() + xlim(0,5)
ggplot(data.frame(rsq = rsq.zero), aes(y=rsq,x=1)) + geom_boxplot() + ylim(0,100)


# Residual distributions compared across lag times.
ggplot(rsq.data, aes(x=rsq, color=factor(offset))) + geom_density() + xlim(0,0.00001)

ggplot(rsq.data, aes(x=rsq, color=factor(offset))) + stat_ecdf() + scale_x_log10()


```

# PLOT examples
```{r}

# Previously used short-intron example: show its row in the parsed table.
shortintron <- "chr3R:7799796:7799949:-@chr3R:7800032:7800179:-"
combo.juncratio.data.parsed[which(combo.juncratio.data.parsed$intron == shortintron), ]
# Values noted for this intron:
# PSI total = 0.02; TPM = 46.98; intron length = 83; intron num = 2; fitvalue = 5.23 +- 0.083
```

pick short intron 
```{r}

# median half-life for the 60-70 length bin: 1.7; mean 2.25

# Intron names look like chr:start:end:strand@chr:start:end:strand, so after
# splitting on ':' field 1 is the chromosome and field 7 the strand.
parsed.intron.fields <- strsplit(as.character(combo.juncratio.data.parsed$intron), split = ":")
combo.juncratio.data.parsed$strand <- unlist(lapply(parsed.intron.fields, "[", 7))
combo.juncratio.data.parsed$chr <- unlist(lapply(parsed.intron.fields, "[", 1))

# Candidate short introns: well expressed, efficiently spliced, 60-70 nt,
# constitutive, fast, first intron, plus strand.
# NOTE(review): lenbin5 is created in the later "Add length bins" chunk, so
# that chunk (or a saved workspace containing the column) is needed first.
short.subset <- subset(combo.juncratio.data.parsed,
                       TPM_total>= 30 & PSI_total <= 0.02 & lenbin5=="60-70" & type=="CI" &
                         fitvalue < 2 & !is.na(intronnum) & intronnum !=0 & intronnum < 2 & strand=="+")

# BED-style tables for the intron and its two flanking exons, built from the
# ':'-separated fields of the intron name (fields 2-3: upstream exon,
# fields 3 and 5: intron, fields 5-6: downstream exon).
short.fields <- strsplit(as.character(short.subset$intron), split = ":")
short.field <- function(k){ unlist(lapply(short.fields, "[", k)) }

short.intron.bed <- data.frame(chr = short.subset$chr,
                               start = short.field(3),
                               end = short.field(5),
                               name = short.subset$intron, dot = ".", strand = short.subset$strand)
write.table(short.intron.bed, file="~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/short/intron.bed", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

short.upexon.bed <- data.frame(chr = short.subset$chr,
                               start = short.field(2),
                               end = short.field(3),
                               name = short.subset$intron, dot = ".", strand = short.subset$strand)
write.table(short.upexon.bed, file="~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/short/upexon.bed", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

short.downexon.bed <- data.frame(chr = short.subset$chr,
                                 start = short.field(5),
                                 end = short.field(6),
                                 name = short.subset$intron, dot = ".", strand = short.subset$strand)
write.table(short.downexon.bed, file="~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/short/downexon.bed", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

# Long-format junction counts; the total sample is placed at x = 25 purely for
# plotting.
short.subset.ratio <- data.frame(intron = rep(short.subset$intron, 4),
                                 ee_count = c(short.subset$ee_count_5, short.subset$ee_count_10, short.subset$ee_count_20, short.subset$ee_count_total),
                                 ie_count = c(short.subset$ie_count_5, short.subset$ie_count_10, short.subset$ie_count_20, short.subset$ie_count_total),
                                 time = rep(c(5, 10, 20, 25), each=nrow(short.subset)))
short.subset.ratio$sum <- short.subset.ratio$ee_count + short.subset.ratio$ie_count
short.subset.introns <- as.character(unique(short.subset.ratio$intron))

# One page per intron: fraction of exon-exon (blue) and intron-exon (pink)
# junction reads at each labeling period.
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/short/ratio_plots.pdf")
for(i in short.subset.introns){
  print(i)
  hold <- short.subset.ratio[which(short.subset.ratio$intron == i), ]
  ratio.plot <- ggplot(hold, aes(x=time)) +
    geom_point(aes(y=ee_count/sum), size=5, color="blue") +
    geom_point(aes(y=ie_count/sum), size=5, color="pink") +
    scale_x_continuous(breaks=c(0,5,10,15,20,25), labels=c("","5m","10m","","20m","total")) +
    ylim(0,1) +
    geom_vline(xintercept=22.5, color="grey17", linetype="dotted") +
    labs(x="labeling period (L)", y="junction ratio", title=i)
  print(ratio.plot)
}
dev.off()

```

pick long intron
```{r}

# mean half-life for introns >1000 nt: 4.8m

# Candidate long (>= 1000 nt) and mid-length (500-999 nt) introns: expressed,
# efficiently spliced, constitutive, slower than 2 min, among the first three
# introns, plus strand. Only the mid-length set is used below.
long.subset <- subset(combo.juncratio.data.parsed,
                      TPM_total >=20 & PSI_total<=0.05 & intronlen>=1000 & type=="CI" &
                        fitvalue > 2 & !is.na(intronnum)& intronnum < 4 & strand=="+")
long.mid.subset <- subset(combo.juncratio.data.parsed,
                          TPM_total >=20 & PSI_total<=0.05 & intronlen>=500 & intronlen<1000 & type=="CI" &
                            fitvalue > 2 & !is.na(intronnum)& intronnum < 4 & strand=="+")

# BED-style tables for the intron and its two flanking exons, built from the
# ':'-separated fields of the intron name (fields 2-3: upstream exon,
# fields 3 and 5: intron, fields 5-6: downstream exon).
long.mid.fields <- strsplit(as.character(long.mid.subset$intron), split = ":")
long.mid.field <- function(k){ unlist(lapply(long.mid.fields, "[", k)) }

long.intron.bed <- data.frame(chr = long.mid.subset$chr,
                              start = long.mid.field(3),
                              end = long.mid.field(5),
                              name = long.mid.subset$intron, dot = ".", strand = long.mid.subset$strand)
write.table(long.intron.bed, file="~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/longmid/intron.bed", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

long.upexon.bed <- data.frame(chr = long.mid.subset$chr,
                              start = long.mid.field(2),
                              end = long.mid.field(3),
                              name = long.mid.subset$intron, dot = ".", strand = long.mid.subset$strand)
write.table(long.upexon.bed, file="~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/longmid/upexon.bed", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

long.downexon.bed <- data.frame(chr = long.mid.subset$chr,
                                start = long.mid.field(5),
                                end = long.mid.field(6),
                                name = long.mid.subset$intron, dot = ".", strand = long.mid.subset$strand)
write.table(long.downexon.bed, file="~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/longmid/downexon.bed", sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

# Long-format junction counts; the total sample is placed at x = 25 purely for
# plotting.
long.subset.ratio <- data.frame(intron = rep(long.mid.subset$intron, 4),
                                ee_count = c(long.mid.subset$ee_count_5, long.mid.subset$ee_count_10, long.mid.subset$ee_count_20, long.mid.subset$ee_count_total),
                                ie_count = c(long.mid.subset$ie_count_5, long.mid.subset$ie_count_10, long.mid.subset$ie_count_20, long.mid.subset$ie_count_total),
                                time = rep(c(5, 10, 20, 25), each=nrow(long.mid.subset)))
long.subset.ratio$sum <- long.subset.ratio$ee_count + long.subset.ratio$ie_count
long.subset.introns <- as.character(unique(long.subset.ratio$intron))

# One page per intron: fraction of exon-exon (blue) and intron-exon (pink)
# junction reads at each labeling period.
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/coverage_examples/longmid/ratio_plots.pdf")
for(i in long.subset.introns){
  print(i)
  hold <- long.subset.ratio[which(long.subset.ratio$intron == i), ]
  ratio.plot <- ggplot(hold, aes(x=time)) +
    geom_point(aes(y=ee_count/sum), size=5, color="blue") +
    geom_point(aes(y=ie_count/sum), size=5, color="pink") +
    scale_x_continuous(breaks=c(0,5,10,15,20,25), labels=c("","5m","10m","","20m","total")) +
    ylim(0,1) +
    geom_vline(xintercept=22.5, color="grey17", linetype="dotted") +
    labs(x="labeling period (L)", y="junction ratio", title=i)
  print(ratio.plot)
}
dev.off()

```

# Length

Add length bins - 20% quantiles
```{r}
# Bin intron length with binVec (defined elsewhere in this file) and order the
# resulting labels as 0%, 20%, ..., 100%; labels outside that set become NA.
len.bin.levels <- paste0(seq(0, 100, 20), "%")
combo.juncratio.data.parsed$len_bin <- binVec(combo.juncratio.data.parsed$intronlen, 20)
combo.juncratio.data.parsed$len_bin <- factor(combo.juncratio.data.parsed$len_bin, levels = len.bin.levels)

# Half-life by length bin, all parsed introns.
ggplot(combo.juncratio.data.parsed, aes(x=factor(len_bin), y=fitvalue)) +
  geom_boxplot(notch=T) +
  scale_y_log10()

# Half-life by length bin and intron type, and by intron type alone,
# excluding introns that contain a skipped exon.
not.SEcontaining <- subset(combo.juncratio.data.parsed, type!="SEcontaining")
ggplot(not.SEcontaining, aes(x=factor(len_bin), y=fitvalue, fill=factor(type))) +
  geom_boxplot(notch=T) +
  scale_y_log10()
ggplot(not.SEcontaining, aes(x=factor(type), y=fitvalue)) +
  geom_boxplot(notch=T) +
  scale_y_log10()
```

Add length bins - 10 bp bins up to 100 nt, 100 bp bins up to 1,000 nt, and 1,000 bp bins up to 10,000 nt
```{r}
# Label every intron with a fixed-width length bin. Bins are assigned from the
# coarsest scheme to the finest, and within a scheme from the largest upper
# bound down, so each intron ends up with the narrowest bin whose upper bound
# is >= its length: 1,000-wide bins up to 10,000, 100-wide bins up to 1,000,
# 10-wide bins up to 100. Anything longer than 10,000 keeps ">10,000".
lenbin5.labels <- rep(">10,000", nrow(combo.juncratio.data.parsed))
bin.schemes <- list(c(from = 1000, to = 10000, width = 1000),
                    c(from = 100, to = 1000, width = 100),
                    c(from = 40, to = 100, width = 10))
for(scheme in bin.schemes){
  for(upper in rev(seq(scheme[["from"]], scheme[["to"]], by = scheme[["width"]]))){
    lenbin5.labels[which(combo.juncratio.data.parsed$intronlen <= upper)] <-
      paste0(as.character(upper - scheme[["width"]]), "-", as.character(upper))
  }
}
combo.juncratio.data.parsed$lenbin5 <- lenbin5.labels

# NOTE(review): introns of length <= 40 receive the label "30-40", which is not
# among the levels below and therefore becomes NA in the factor - confirm that
# this is intended.
combo.juncratio.data.parsed$lenbin5 <- factor(combo.juncratio.data.parsed$lenbin5, levels=c("40-50","50-60","60-70","70-80","80-90","90-100",
                                                                                            "100-200","200-300","300-400","400-500","500-600","600-700","700-800","800-900","900-1000",
                                                                                            "1000-2000","2000-3000","3000-4000","4000-5000","5000-6000","6000-7000","7000-8000","8000-9000","9000-10000",">10,000"))

ggplot(combo.juncratio.data.parsed, aes(x=factor(lenbin5),y=fitvalue)) + geom_boxplot(notch=T) + scale_y_log10()
ggplot(subset(combo.juncratio.data.parsed, intronlen<=100), aes(x=factor(lenbin5),y=fitvalue)) + geom_boxplot(notch=T) + scale_y_log10()

# get length histogram for background
# Number of parsed introns in every bin that actually occurs. seq_along()
# keeps this correct when no bin is present (1:length() would iterate over
# c(1, 0) and produce a spurious count).
lenbin5_unique <- sort(unique(combo.juncratio.data.parsed$lenbin5))
combo_introncounts <- vapply(seq_along(lenbin5_unique), function(k){
  length(which(combo.juncratio.data.parsed$lenbin5 == lenbin5_unique[k]))
}, integer(1))

combo.lenbin.data <- data.frame(lenbin=lenbin5_unique, counts=combo_introncounts)


```

# Exonlength
```{r}

# Add exon length columns
# Row names are ":"-separated; they are split once here. Field 7 is the strand, and the two flanking
# exon lengths are (field 3 - field 2) and (field 6 - field 5).
junction.fields <- strsplit(rownames(combo.juncratio.data.parsed), split=":")
strands <- unlist(lapply(junction.fields, "[", 7))
firstexon <- as.numeric(unlist(lapply(junction.fields, "[", 3))) -
             as.numeric(unlist(lapply(junction.fields, "[", 2)))
secondexon <- as.numeric(unlist(lapply(junction.fields, "[", 6))) -
              as.numeric(unlist(lapply(junction.fields, "[", 5)))

# On the "-" strand the two exons swap roles: the second one is upstream.
minus.strand.rows <- which(strands == "-")
combo.juncratio.data.parsed$upexon_len <- firstexon
combo.juncratio.data.parsed$upexon_len[minus.strand.rows] <- secondexon[minus.strand.rows]
combo.juncratio.data.parsed$downexon_len <- secondexon
combo.juncratio.data.parsed$downexon_len[minus.strand.rows] <- firstexon[minus.strand.rows]

# ratio of exon to intron
# meanexon = mean of the two flanking exon lengths; IEratio = intron length / meanexon
combo.juncratio.data.parsed$meanexon <- (combo.juncratio.data.parsed$upexon_len + combo.juncratio.data.parsed$downexon_len) / 2
combo.juncratio.data.parsed$IEratio <- combo.juncratio.data.parsed$intronlen / combo.juncratio.data.parsed$meanexon

# IEratio < 0.75 -> "introndef", IEratio > 1.33 -> "exondef", everything else (including NA) -> "confused"
combo.juncratio.data.parsed$IEratio_bin <- "confused"
combo.juncratio.data.parsed$IEratio_bin[which(combo.juncratio.data.parsed$IEratio < 0.75)] <- "introndef"
combo.juncratio.data.parsed$IEratio_bin[which(combo.juncratio.data.parsed$IEratio > 1.33)] <- "exondef"

# binning by exon length
# Start everything in the lowest quintile, then promote rows at or above each successive cut-off.
exon.quintile.labels <- c("20%","40%","60%","80%","100%")
exon.quintile.probs <- c(0.2, 0.4, 0.6, 0.8)
combo.juncratio.data.parsed$len_bin_exon <- exon.quintile.labels[1]
for(q in seq_along(exon.quintile.probs)){
  exon.cutoff <- quantile(combo.juncratio.data.parsed$meanexon, exon.quintile.probs[q])
  combo.juncratio.data.parsed$len_bin_exon[which(combo.juncratio.data.parsed$meanexon >= exon.cutoff)] <- exon.quintile.labels[q+1]
}
combo.juncratio.data.parsed$len_bin_exon <- factor(combo.juncratio.data.parsed$len_bin_exon, levels=exon.quintile.labels)

# separating into definitions to bin separately
# Quintile-bin one definition class on its own distribution.
#   df         : rows of combo.juncratio.data.parsed for a single IEratio_bin class
#   def.suffix : tag appended to the labels of the *_def columns ("_ED" or "_ID")
# For intron length, mean exon length and upstream exon length this adds/overwrites
#   len_bin_<what>      factor with levels "20%".."100%"
#   len_bin_<what>_def  the same bins with def.suffix appended, so both classes can share one legend
# Columns are looked up by their exact names. The previous code read `$upexon`, which only reached
# `upexon_len` through `$` partial matching and would silently leave every row in "20%" if that match
# ever became ambiguous or the data were a tibble.
add_def_quintile_bins <- function(df, def.suffix){
  quintile.labels <- c("20%","40%","60%","80%","100%")
  # new column name -> column whose quintiles are taken (order fixes the order of the added columns)
  bin.sources <- c(len_bin_intron="intronlen", len_bin_exon="meanexon", len_bin_upexon="upexon_len")
  for(bin.col in names(bin.sources)){
    lens <- df[[bin.sources[[bin.col]]]]
    cutoffs <- quantile(lens, c(0.2, 0.4, 0.6, 0.8))
    # everything starts in the lowest bin and is promoted past each cut-off it reaches
    bins <- rep(quintile.labels[1], length(lens))
    for(q in seq_along(cutoffs)){
      bins[which(lens >= cutoffs[q])] <- quintile.labels[q+1]
    }
    df[[bin.col]] <- factor(bins, levels=quintile.labels)
    df[[paste0(bin.col,"_def")]] <- factor(paste0(bins, def.suffix), levels=paste0(quintile.labels, def.suffix))
  }
  df
}

# re-doing bins for only exon def
combo.juncratio.data.parsed.ED <- add_def_quintile_bins(subset(combo.juncratio.data.parsed, IEratio_bin=="exondef"), "_ED")

# re-doing bins for only intron def
combo.juncratio.data.parsed.ID <- add_def_quintile_bins(subset(combo.juncratio.data.parsed, IEratio_bin=="introndef"), "_ID")

# combine again
# rbind() merges the factor levels, so the *_def columns carry the "_ED" levels followed by the "_ID" levels.
combo.juncratio.data.parsed.bothdef <- rbind(combo.juncratio.data.parsed.ED, combo.juncratio.data.parsed.ID)

# Fill colours and legend labels shared by the three quintile plots below:
# five greens for the "_ED" bins followed by five purples for the "_ID" bins.
def.quintile.colors <- c(brewer.pal(9,"BuGn")[4:8], brewer.pal(9,"BuPu")[4:8])
def.quintile.labels <- c("61-447 nt","448-839 nt","840-1,517 nt","1,518-3,528 nt","> 3,529 nt","43-59 nt","60-63 nt","64-68 nt","69-87 nt","88-2,455 nt")

# half-life by definition class, split into intron-length quintiles within each class
ggplot(combo.juncratio.data.parsed.bothdef,
       aes(x=factor(IEratio_bin), y=fitvalue, fill=factor(len_bin_intron_def))) +
  geom_boxplot(notch=TRUE, outlier.color="lightgrey") +
  scale_y_log10(limits=c(0.4,40), breaks=c(1,4,16)) +
  scale_x_discrete(labels=c("exon definition","intron definition")) +
  scale_fill_manual(values=def.quintile.colors, labels=def.quintile.labels,
                    guide=guide_legend(nrow=2, byrow=TRUE)) +
  labs(x="", y="half-life (min)", fill="intron length\n(quintiles)") +
  theme(legend.position="bottom")

# intron length distribution coloured by the same bins: stacked histogram ...
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=intronlen, fill=factor(len_bin_intron_def))) +
  geom_histogram(position="stack", bins=500) +
  scale_x_log10() +
  scale_fill_manual(values=def.quintile.colors, labels=def.quintile.labels,
                    guide=guide_legend(nrow=2, byrow=TRUE)) +
  theme(legend.position="bottom")

# ... and overlaid densities
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=intronlen, fill=factor(len_bin_intron_def))) +
  geom_density(alpha=0.75) +
  scale_x_log10() +
  scale_fill_manual(values=def.quintile.colors, labels=def.quintile.labels,
                    guide=guide_legend(nrow=2, byrow=TRUE)) +
  theme(legend.position="bottom")

# alternative views of the same distribution with default colours
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=intronlen, color=factor(len_bin_intron_def))) +
  geom_freqpoly() +
  scale_x_log10()
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=intronlen, color=factor(len_bin_intron_def))) +
  geom_area(aes(y=..count.., fill=factor(len_bin_intron_def)), stat="bin") +
  scale_x_log10()

# half-life by definition class, split by exon-length quintile
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=factor(IEratio_bin), y=fitvalue, fill=factor(len_bin_exon))) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()

## look at RInE vs mean exon length
ggplot(combo.juncratio.data.parsed, aes(x=IEratio)) +
  geom_histogram() +
  scale_x_log10()

# Three mean-exon-length classes cut at the 33% and 66% quantiles; rows below the first cut stay "short".
meanexon.cutoffs <- quantile(combo.juncratio.data.parsed$meanexon, c(0.33, 0.66))
combo.juncratio.data.parsed$meanexon_bin <- "short"
combo.juncratio.data.parsed$meanexon_bin[which(combo.juncratio.data.parsed$meanexon >= meanexon.cutoffs[1])] <- "medium"
combo.juncratio.data.parsed$meanexon_bin[which(combo.juncratio.data.parsed$meanexon >= meanexon.cutoffs[2])] <- "long"

ggplot(combo.juncratio.data.parsed, aes(x=IEratio, fill=factor(meanexon_bin))) +
  geom_histogram(position = "dodge") +
  scale_x_log10()

## heatmap of RInE
# intron length vs mean exon length on log axes, coloured by log10(IEratio); shown plain and with a linear fit
rine.scatter <- ggplot(combo.juncratio.data.parsed, aes(x=intronlen, y=meanexon, color=log10(IEratio))) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_gradient2(low="red", high="blue")
rine.scatter

rine.scatter +
  geom_smooth(method="lm")


# same axes, coloured by log10 half-life instead
ggplot(combo.juncratio.data.parsed, aes(x=intronlen, y=meanexon, color=log10(fitvalue))) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

# Mean half-life on a grid of 10-nt intron-length bins x 10-nt mean-exon-length bins.
# The binning and accumulation are vectorised; they used to scan the whole grid once per intron
# (printing a progress counter on every iteration), which was needlessly slow.

# Map each length to the start of its 10-nt bin [start, start+10).
# Lengths past the last bin, below the first bin, or NA are assigned to the last bin start.
# NOTE(review): this means lengths below min(bin.starts) land in the TOP bin -- confirm that is
# intended, particularly for mean exon lengths under 40 nt.
assign_10nt_bin <- function(len, bin.starts){
  bin.index <- findInterval(len, bin.starts)
  bin.index[is.na(bin.index) | bin.index == 0] <- length(bin.starts)
  bin.starts[bin.index]
}

intron.bin.seq <- seq(40,5000,10)
combo.juncratio.data.parsed$intron10nt <- assign_10nt_bin(combo.juncratio.data.parsed$intronlen, intron.bin.seq)

exon.bin.seq <- seq(40,2000,10)
combo.juncratio.data.parsed$exon10nt <- assign_10nt_bin(combo.juncratio.data.parsed$meanexon, exon.bin.seq)

# One row per (intron bin, exon bin) pair; the intron bin cycles fastest.
heatmap.data <- data.frame(intron = rep(intron.bin.seq, length(exon.bin.seq)), exon = rep(exon.bin.seq, each=length(intron.bin.seq)),
                  nintron = 0, sum_hl = 0)
# Row of heatmap.data that each intron falls into, derived from the grid layout above.
binind <- as.integer((match(combo.juncratio.data.parsed$exon10nt, exon.bin.seq) - 1) * length(intron.bin.seq) +
                     match(combo.juncratio.data.parsed$intron10nt, intron.bin.seq))
# introns per cell, and the summed half-life per occupied cell (an NA half-life makes its cell's sum NA)
heatmap.data$nintron <- as.numeric(tabulate(binind, nbins=nrow(heatmap.data)))
hl.sum.by.bin <- tapply(combo.juncratio.data.parsed$fitvalue, binind, sum)
heatmap.data$sum_hl[as.integer(names(hl.sum.by.bin))] <- as.numeric(hl.sum.by.bin)

# empty cells give 0/0 = NaN here and are filtered out in the second and third plots
heatmap.data$mean_hl <- heatmap.data$sum_hl/heatmap.data$nintron

ggplot(heatmap.data, aes(x=intron, y=exon, fill=log2(mean_hl))) + geom_tile()
ggplot(subset(heatmap.data, nintron!=0), aes(x=intron, y=exon, fill=log2(mean_hl))) + geom_tile()
ggplot(subset(heatmap.data, nintron!=0 & intron<=1000), aes(x=intron, y=exon, fill=log2(mean_hl))) + geom_tile()


```

Make circular plot
```{r}

getpercentile <- function(datacol, div){
  # Assign each value to a quantile bin, labelled by the bin's upper quantile.
  #   datacol : numeric vector of the value of interest
  #   div     : bin width as a fraction of the data (e.g. 0.1 for deciles); a single positive number
  # Returns a numeric vector the same length as datacol holding div, 2*div, ...
  # Values at or above the quantile `i` are labelled i+div; everything else, including NA
  # (which() drops NA comparisons), keeps the lowest label `div`.
  stopifnot(is.numeric(div), length(div) == 1, is.finite(div), div > 0)
  quantrow <- rep(div, length(datacol))
  # With div > 0.5 (e.g. a single bin, div = 1) there is no interior cut point and
  # seq(div, 1-div, by=div) would stop with "wrong sign in 'by' argument":
  # every value simply belongs to the one bin labelled `div`.
  if(1-div < div){
    return(quantrow)
  }
  for(i in seq(div, 1-div, by=div)){
    quantrow[which(datacol >= quantile(datacol, i, na.rm=TRUE))] <- i+div
  }
  quantrow
}

# bin by distance from 0 (combined intron & exon len distance)
# circ_dist is the Euclidean distance sqrt(intronlen^2 + meanexon^2); circ_bin is its decile (getpercentile with div = 1/10).
circ.data <- combo.juncratio.data.parsed[,c('intronlen','meanexon','IEratio','fitvalue')]
circ.data$circ_dist <- sqrt((circ.data$intronlen ^ 2) + (circ.data$meanexon ^ 2))
circ.data$circ_bin <- getpercentile(circ.data$circ_dist, 1/10)
ggplot(circ.data, aes(x=intronlen, y=meanexon, color=factor(circ_bin))) + geom_point() + scale_x_log10() + scale_y_log10()

# get proportional quantile amounts
# Split binnum bins between IEratio < 1 and IEratio > 1 in proportion to how many introns fall on each side;
# id.quantile / ed.quantile are the resulting per-side bin widths (as fractions) passed to getpercentile().
# NOTE(review): the proportions use strict < 1 and > 1, while the binning below uses >= 1 for the second group.
binnum=40
id.prop <- nrow(subset(circ.data, IEratio < 1))/nrow(circ.data)
ed.prop <- nrow(subset(circ.data, IEratio > 1))/nrow(circ.data)
id.quantile <- 1/floor(binnum*id.prop)
ed.quantile <- 1/ceiling(binnum*ed.prop)

# bin by RIME (ratio of intron/exon)
# IEratio < 1 rows get negative bin values, IEratio >= 1 rows get positive ones, each binned within its own group.
# Rows matched by neither which() (e.g. NA IEratio) keep the initial 0.
circ.data$RIMEbin <- 0
circ.data$RIMEbin[which(circ.data$IEratio < 1)] <- -(1-getpercentile(subset(circ.data, IEratio < 1)$IEratio, id.quantile)+id.quantile)
circ.data$RIMEbin[which(circ.data$IEratio >= 1)] <- getpercentile(subset(circ.data, IEratio >= 1)$IEratio, ed.quantile)
#ggplot(circ.data, aes(x=intronlen, y=meanexon, color=factor(RIMEbin))) + geom_point() + scale_x_log10() + scale_y_log10()

# format data
# One row per (circ_bin, RIMEbin) combination, holding the mean and median half-life of the introns in that cell.
# Empty cells give mean = NaN and median = NA.
circ.seq <- sort(unique(circ.data$circ_bin))
RIME.seq <- sort(unique(circ.data$RIMEbin))
circ.data.format <- data.frame(circ = rep(circ.seq, length(RIME.seq)), RIME = rep(RIME.seq, each=length(circ.seq)),
                               mean = NA, median = NA)
for(i in 1:nrow(circ.data.format)){
  print(i)
  hold.data <- subset(circ.data, circ_bin==circ.data.format$circ[i] & RIMEbin==circ.data.format$RIME[i])
  circ.data.format$mean[i] <- mean(hold.data$fitvalue, na.rm=T)
  circ.data.format$median[i] <- median(hold.data$fitvalue, na.rm=T)
}

# change to polar coordinates
# Angular positions on a 0-1 scale: negative RIME bins are spread evenly over [0, 0.125), positive bins over
# [0.125, 0.25), and [0.25, 1) is divided into 0.01-wide filler slots that hold no data.
# head(..., -1) drops the closing endpoint of each seq().
replace.RIME.seq.ID <- head(seq(0, 0.125, by=0.125/length(which(RIME.seq<0))), -1)
replace.RIME.seq.ED <- head(seq(0.125, 0.25, by=0.125/length(which(RIME.seq>0))), -1)
replace.RIME.seq.non <- head(seq(0.25, 1, by=0.01), -1)
replace.RIME.seq <- c(replace.RIME.seq.ID, replace.RIME.seq.ED, replace.RIME.seq.non)
# add variable widths
# each slot is as wide as its section divided by the number of slots in that section
RIME.width.ID <- rep(0.125/length(replace.RIME.seq.ID), length(replace.RIME.seq.ID))
RIME.width.ED <- rep(0.125/length(replace.RIME.seq.ED), length(replace.RIME.seq.ED))
RIME.width.non <- rep(0.75/length(replace.RIME.seq.non), length(replace.RIME.seq.non))
RIME.width.seq <- c(RIME.width.ID, RIME.width.ED, RIME.width.non)

# The i-th sorted RIME bin takes the i-th angular slot and width.
# NOTE(review): this positional pairing assumes RIME.seq holds only negative then positive values; a leftover 0
# bin would sit between them and shift every positive bin by one slot -- confirm no row keeps RIMEbin == 0.
circ.data.format$RIMEpolar <- 0
circ.data.format$RIMEwidth <- 0
for(i in 1:length(RIME.seq)){
  circ.data.format$RIMEpolar[which(circ.data.format$RIME == RIME.seq[i])] <- replace.RIME.seq[i]
  circ.data.format$RIMEwidth[which(circ.data.format$RIME == RIME.seq[i])] <- RIME.width.seq[i]
}
# Append the filler slots (RIMEpolar >= 0.25) as rows with NA mean/median, all placed in the smallest circ bin.
circ.data.format <- rbind(circ.data.format,
                          data.frame(circ = min(circ.seq),RIME = 1, RIMEpolar = replace.RIME.seq[which(replace.RIME.seq >= 0.25)], RIMEwidth = RIME.width.seq[which(replace.RIME.seq >= 0.25)],
                                     mean = NA, median = NA))
circ.data.format <- subset(circ.data.format, RIMEpolar<1)

# Bin the cell means into 5% quantile steps. getpercentile() leaves NA inputs in its lowest bin,
# so cells without a mean are reset to NA explicitly. circ is rounded to 4 decimals.
circ.data.format$hl_bin <- getpercentile(circ.data.format$mean, 0.05)
circ.data.format$hl_bin[which(is.na(circ.data.format$mean))] <- NA
circ.data.format$circ <- round(circ.data.format$circ, 4)

# Polar tile plot of half-life percentile (fill) by RImE slot (angle) and aggregate-length bin (radius).
# The two PDFs share everything except the high end of the colour gradient and the length of the gold marker line.
circ.tile.base <- ggplot(circ.data.format, aes(x=RIMEpolar, y=factor(circ))) +
  geom_tile(aes(fill=hl_bin, width=RIMEwidth)) +
  coord_polar(theta="x",start=0)
circ.tile.labels <- labs(x="percentiles of RImE",y="percentiles of aggregated intron & exon length",fill="percentiles of intron half-life")
circ.tile.theme <- theme(legend.position=c(0.25,0.25),axis.text.x=element_blank())
circ.pdf.prefix <- paste0("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/revisedfigures/fig2_circplot_",binnum)

# version 1: yellow -> purple1, marker drawn out to y = 15.5
pdf(paste0(circ.pdf.prefix,"radial_15L.pdf"),width=4,height=3.5)
circ.tile.base +
  scale_fill_gradient(low="yellow",high="purple1",na.value="black") +
  geom_segment(aes(x=0.122,xend=0.122,y=0,yend=15.5),color="gold",size=0.75) +
  circ.tile.labels +
  circ.tile.theme
dev.off()

# version 2: yellow -> purple4, marker drawn out to y = 10.5
pdf(paste0(circ.pdf.prefix,"radial_10L_yellowpurple4.pdf"),width=4,height=3.5)
circ.tile.base +
  scale_fill_gradient(low="yellow",high="purple4",na.value="black") +
  geom_segment(aes(x=0.122,xend=0.122,y=0,yend=10.5),color="gold",size=0.75) +
  circ.tile.labels +
  circ.tile.theme
dev.off()

# Scatter plots of intron length vs mean exon length showing how the two binning schemes partition the data.
# Both plots use the same log axes (40-10,000 nt) and the same text/legend sizing.
circ_length_axes <- function(){
  list(scale_x_log10(limits=c(40,10000),breaks=c(10,100,1000,10000,100000),labels=comma),
       scale_y_log10(limits=c(40,10000),breaks=c(10,100,1000,10000),labels=comma))
}
circ.scatter.theme <- theme(legend.position="right",axis.text=element_text(size=12),axis.title=element_text(size=12),
                            legend.text=element_text(size=11),legend.title=element_text(size=11),legend.key.size=unit(3,"mm"))

# coloured by RIME bin: diverging gradient, dark at both extremes
ggplot(circ.data, aes(x=intronlen, y=meanexon, color=RIMEbin)) +
  geom_point(alpha=0.5) +
  circ_length_axes() +
  scale_color_gradient2(low="grey17",mid="slategray1",high="grey17") +
  labs(x="intron length (nt)", y="mean exon length (nt)", color="bins of RIME") +
  background_grid(major="xy") +
  circ.scatter.theme

# coloured by aggregate-length bin: single gradient
ggplot(circ.data, aes(x=intronlen, y=meanexon, color=circ_bin)) +
  geom_point(alpha=0.5) +
  circ_length_axes() +
  scale_color_gradient(low="grey17",high="slategray1") +
  labs(x="intron length (nt)", y="mean exon length (nt)", color="aggregate bins") +
  background_grid(major="xy") +
  circ.scatter.theme

```

Add running median to RInE vs. half-life plot
```{r}

# Sort introns by intron/exon ratio, then compute running medians of the half-life column for several
# window settings (get_runningmedians() is defined elsewhere in this file).
combo.juncratio.data.parsed.RINEordered <- combo.juncratio.data.parsed[order(combo.juncratio.data.parsed$IEratio),]
fitvalue.col.RINE <- which(names(combo.juncratio.data.parsed.RINEordered)=="fitvalue")

combo.median.data.50txpts.RINE <- get_runningmedians(combo.juncratio.data.parsed.RINEordered, 50, 5, fitvalue.col.RINE)
combo.median.data.100txpts.RINE <- get_runningmedians(combo.juncratio.data.parsed.RINEordered, 100, 5, fitvalue.col.RINE)
combo.median.data.200txpts.RINE <- get_runningmedians(combo.juncratio.data.parsed.RINEordered, 200, 5, fitvalue.col.RINE)
combo.median.data.500txpts.RINE <- get_runningmedians(combo.juncratio.data.parsed.RINEordered, 500, 5, fitvalue.col.RINE)

# raw scatter with a smoother; the red line marks intron/exon ratio = 1
ggplot(combo.juncratio.data.parsed, aes(x=IEratio, y=fitvalue)) +
  geom_point(alpha=0.25) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  geom_vline(xintercept=1,color="red") +
  labs(x="intron/exon ratio",y="half-lives (min)")

# the fitted points are only drawn strictly between the 10th row and the 10th-from-last row of the 200-setting table
rine.min <- combo.median.data.200txpts.RINE$IEratio[10]
rine.max <- combo.median.data.200txpts.RINE$IEratio[nrow(combo.median.data.200txpts.RINE)-10]

rine.fox.colors <- wes_palette("FantasticFox")
ggplot(combo.median.data.200txpts.RINE, aes(x=IEratio,y=half)) +
  geom_point(alpha=0.15) +
  geom_vline(xintercept=1,color=rine.fox.colors[2],linetype="dashed",alpha=0.95) +
  geom_point(data=subset(combo.median.data.200txpts.RINE, IEratio > rine.min & IEratio < rine.max),
             aes(x=IEratio, y=fit_half),color=rine.fox.colors[3]) +
  scale_y_log10(limits=c(0.4,40),breaks=c(0.5,1,4,16,64,256,1024)) +
  scale_x_log10(breaks=c(0.1,0.3,1,3,10),labels=comma) +
  labs(x="RInE",y="half-life (min)") +
  theme(axis.text.x=element_text(angle=45,hjust=1,size=8)) +
  geom_vline(xintercept=c(0.70,1.3),color="red")

# Three classes split at the red lines above: <= 0.70 "lower", (0.70, 1.3) "mid", >= 1.3 "upper".
# Rows with NA IEratio stay "lower" because which() skips them.
combo.juncratio.data.parsed$IEratio_thirds <- "lower"
combo.juncratio.data.parsed$IEratio_thirds[which(combo.juncratio.data.parsed$IEratio > 0.70)] <- "mid"
combo.juncratio.data.parsed$IEratio_thirds[which(combo.juncratio.data.parsed$IEratio >= 1.3)] <- "upper"

ggplot(combo.juncratio.data.parsed, aes(x=factor(IEratio_thirds),y=fitvalue)) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()

```

Proportion of variance in splicing half-lives explained by length
```{r}

# Two-level length category, split at 65 nt, so the models below can fit a separate
# intercept and slope on either side of that length.
combo.juncratio.data.parsed$len_category <- "under65"
combo.juncratio.data.parsed$len_category[which(combo.juncratio.data.parsed$intronlen >= 65)] <- "over65"

# Each stanza below fits the same model twice: glm() (no family given, so its default is used) to report
# 1 - deviance/null.deviance as the fraction of variation explained, and lm() for the coefficient summary.
# glm.test and lm.test are overwritten by every stanza.

# including all data: 3.6% explained
glm.test <- glm(data=combo.juncratio.data.parsed, formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
1-(glm.test$deviance/glm.test$null.deviance)
lm.test <- lm(data=combo.juncratio.data.parsed, formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
summary(lm.test)

# using only introns <= 100 nt: 2.6% explained
glm.test <- glm(data=subset(combo.juncratio.data.parsed, intronlen<=100), formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
1-(glm.test$deviance/glm.test$null.deviance)
lm.test <- lm(data=subset(combo.juncratio.data.parsed, intronlen<=100), formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
summary(lm.test)

# separately for ID vs ED
# intron-definition introns only
glm.test <- glm(data=subset(combo.juncratio.data.parsed, IEratio_bin=="introndef"), formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
1-(glm.test$deviance/glm.test$null.deviance)
lm.test <- lm(data=subset(combo.juncratio.data.parsed, IEratio_bin=="introndef"), formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
summary(lm.test)

# exon-definition introns only
glm.test <- glm(data=subset(combo.juncratio.data.parsed, IEratio_bin=="exondef"), formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
1-(glm.test$deviance/glm.test$null.deviance)
lm.test <- lm(data=subset(combo.juncratio.data.parsed, IEratio_bin=="exondef"), formula=log10(fitvalue)~factor(len_category)*log10(intronlen))
summary(lm.test)

# include def in model
# subset() with no condition keeps every row, so this uses all introns with IEratio_bin as a third interacting factor
glm.test <- glm(data=subset(combo.juncratio.data.parsed), formula=log10(fitvalue)~factor(len_category)*log10(intronlen)*factor(IEratio_bin))
1-(glm.test$deviance/glm.test$null.deviance)
lm.test <- lm(data=subset(combo.juncratio.data.parsed), formula=log10(fitvalue)~factor(len_category)*log10(intronlen)*factor(IEratio_bin))
summary(lm.test)

```

Add running medians of half life
```{r}

# Sort introns by length, then compute running medians of the half-life column for several
# window settings (get_runningmedians() is defined elsewhere in this file).
combo.juncratio.data.parsed.ordered <- combo.juncratio.data.parsed[order(combo.juncratio.data.parsed$intronlen),]
fitvalue.col.len <- which(names(combo.juncratio.data.parsed.ordered)=="fitvalue")

combo.median.data.50txpts <- get_runningmedians(combo.juncratio.data.parsed.ordered, 50, 5, fitvalue.col.len)
combo.median.data.100txpts <- get_runningmedians(combo.juncratio.data.parsed.ordered, 100, 5, fitvalue.col.len)
combo.median.data.200txpts <- get_runningmedians(combo.juncratio.data.parsed.ordered, 200, 5, fitvalue.col.len)
combo.median.data.250txpts <- get_runningmedians(combo.juncratio.data.parsed.ordered, 250, 5, fitvalue.col.len)
combo.median.data.500txpts <- get_runningmedians(combo.juncratio.data.parsed.ordered, 500, 5, fitvalue.col.len)

# pieces shared by the three plots below
runningmedian.fit.color <- wes_palette("FantasticFox")[3]
runningmedian.labels <- labs(x="intron length (nt)",y="half-life (min)")
runningmedian.theme <- theme(axis.text.x=element_text(angle=45,hjust=1,size=8))

# on-screen check with the 200 setting: half (grey) and fit_half (coloured) against length
ggplot(combo.median.data.200txpts, aes(x=length,y=half)) +
  geom_point(alpha=0.25) +
  geom_point(aes(x=length, y=fit_half),color=runningmedian.fit.color) +
  scale_y_log10(limits=c(0.05,60),breaks=c(1,4,16,64,256,1024)) +
  scale_x_log10(breaks=c(50,100,1000,10000,100000),labels=comma) +
  runningmedian.labels +
  runningmedian.theme

# 500 setting with the intron-definition (blue) and exon-definition (red) fits overlaid;
# the *.ID.* and *.ED.* tables are expected to exist already (they are not created in this chunk)
runningmedian.500.layers <- ggplot(combo.median.data.500txpts, aes(x=length,y=half)) +
  geom_point(alpha=0.25) +
  geom_point(aes(x=length, y=fit_half),color=runningmedian.fit.color) +
  geom_point(data=combo.median.data.ID.500txpts, aes(x=length, y=fit_half),color="blue") +
  geom_point(data=combo.median.data.ED.500txpts, aes(x=length, y=fit_half),color="red")

# full length range
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/exondef/runningmedians.pdf")
runningmedian.500.layers +
  scale_y_log10(limits=c(0.25,32),breaks=c(1,4,16,64,256,1024)) +
  scale_x_log10(breaks=c(50,100,1000,10000,100000),labels=comma) +
  runningmedian.labels +
  runningmedian.theme
dev.off()

# same plot with the x axis limited to 100-100,000 nt
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/exondef/runningmedians.g100.pdf")
runningmedian.500.layers +
  scale_y_log10(limits=c(0.25,32),breaks=c(1,4,16,64,256,1024)) +
  scale_x_log10(limits=c(100,100000),breaks=c(50,100,1000,10000,100000),labels=comma) +
  runningmedian.labels +
  runningmedian.theme
dev.off()


```

Look at kmers in exons/introns of ID/ED genes
```{r}

getintronbed <- function(x){
  # BED-style table (chr, start, end, name, dot, strand) for the intron of each ":"-separated ID in x.
  # start is field 3, end is field 5 minus 1; chr is field 1 and strand is field 7.
  id_parts <- strsplit(as.character(x), split=":")
  part <- function(k) unlist(lapply(id_parts, "[", k))
  data.frame(chr = part(1),
             start = as.numeric(part(3)),
             end = as.numeric(part(5)) - 1,
             name = x,
             dot = ".",
             strand = part(7))
}
getintronbed_rmSS <- function(x){
  # Intron BED table with both ends trimmed: start (field 3) is moved in by 10 and end (field 5) by 25,
  # and the two offsets are swapped for IDs whose strand (field 7) is "-".
  id_parts <- strsplit(as.character(x), split=":")
  part <- function(k) unlist(lapply(id_parts, "[", k))
  intron_start <- as.numeric(part(3))
  intron_end <- as.numeric(part(5))
  strand <- part(7)
  # anything that is not exactly "-" (including a missing strand) gets the "+" offsets
  on_minus <- strand %in% "-"
  data.frame(chr = part(1),
             start = intron_start + ifelse(on_minus, 25, 10),
             end = intron_end - ifelse(on_minus, 10, 25),
             name = x,
             dot = ".",
             strand = strand)
}
getintronbed_rmSS_split <- function(x){
  # Two 100-wide windows per intron ID, taken from the two ends of the trimmed intron.
  # Returns the "up" rows for every ID followed by the "down" rows, with a leading `type` column.
  id_parts <- strsplit(as.character(x), split=":")
  part <- function(k) unlist(lapply(id_parts, "[", k))
  chrom <- part(1)
  strand <- part(7)
  on_minus <- strand %in% "-"
  # trimmed intron: 10 off the start and 25 off the end, offsets swapped on the "-" strand
  trimmed_start <- as.numeric(part(3)) + ifelse(on_minus, 25, 10)
  trimmed_end <- as.numeric(part(5)) - ifelse(on_minus, 10, 25)
  # "up" is the window at the low-coordinate end on "+" and at the high-coordinate end on "-"; "down" is the other one
  up_start <- ifelse(on_minus, trimmed_end - 100, trimmed_start)
  down_start <- ifelse(on_minus, trimmed_start, trimmed_end - 100)
  upbed <- data.frame(chr = chrom, start = up_start, end = up_start + 100,
                      name = x, dot = ".", strand = strand)
  downbed <- data.frame(chr = chrom, start = down_start, end = down_start + 100,
                        name = x, dot = ".", strand = strand)
  # add identifier
  data.frame(type = rep(c("up", "down"), each=length(x)),
             rbind(upbed, downbed))
}

getupexonbed <- function(x){
  # BED table of the upstream exon for each ":"-separated ID in x:
  # fields 2-3 normally, fields 5-6 when the strand (field 7) is "-".
  id_parts <- strsplit(as.character(x), split=":")
  part <- function(k) unlist(lapply(id_parts, "[", k))
  first_start <- as.numeric(part(2))
  first_end <- as.numeric(part(3))
  second_start <- as.numeric(part(5))
  second_end <- as.numeric(part(6))
  strand <- part(7)
  on_minus <- strand %in% "-"
  data.frame(chr = part(1),
             start = ifelse(on_minus, second_start, first_start),
             end = ifelse(on_minus, second_end, first_end),
             name = x,
             dot = ".",
             strand = strand)
}
# BED rows for the exon downstream of each intron.
#
# x: vector of ":"-separated intron identifiers. Fields 2-3 and 5-6 hold the
#    start/end of the two flanking exons, field 1 the chromosome and field 7
#    the strand.
#
# For minus-strand introns the exon in fields 2-3 is reported, otherwise the
# exon in fields 5-6. Returns a data.frame with columns
# chr, start, end, name, dot, strand.
getdownexonbed <- function(x){
  parts <- strsplit(as.character(x), split=":")
  txt_field <- function(i) vapply(parts, "[", character(1), i)
  num_field <- function(i) as.numeric(txt_field(i))

  first_start <- num_field(2)
  first_end <- num_field(3)
  second_start <- num_field(5)
  second_end <- num_field(6)
  strand <- txt_field(7)
  # %in% never yields NA, so ids without a strand behave like "+" ids
  on_minus <- strand %in% "-"

  data.frame(chr = txt_field(1),
             start = ifelse(on_minus, first_start, second_start),
             end = ifelse(on_minus, first_end, second_end),
             name = x, dot = ".", strand = strand)
}

# BED rows for a 9-nt window around the 5' splice site of each intron.
#
# x: vector of ":"-separated intron identifiers (field 1 chromosome, fields 3
#    and 5 intron start/end, field 7 strand).
#
# Plus strand: [start - 3, start + 6]; minus strand: [end - 7, end + 2].
# Returns a data.frame with columns chr, start, end, name, dot, strand.
get5ssbed <- function(x){
  parts <- strsplit(as.character(x), split=":")
  txt_field <- function(i) vapply(parts, "[", character(1), i)

  intron_start <- as.numeric(txt_field(3))
  intron_end <- as.numeric(txt_field(5))
  strand <- txt_field(7)
  # %in% never yields NA, so ids without a strand behave like "+" ids
  on_minus <- strand %in% "-"

  data.frame(chr = txt_field(1),
             start = ifelse(on_minus, intron_end - 7, intron_start - 3),
             end = ifelse(on_minus, intron_end + 2, intron_start + 6),
             name = x, dot = ".", strand = strand)
}
# BED rows for a 22-nt window around the 3' splice site of each intron.
#
# x: vector of ":"-separated intron identifiers (field 1 chromosome, fields 3
#    and 5 intron start/end, field 7 strand).
#
# Plus strand: [end - 20, end + 2]; minus strand: [start - 3, start + 19].
# Returns a data.frame with columns chr, start, end, name, dot, strand.
get3ssbed <- function(x){
  parts <- strsplit(as.character(x), split=":")
  txt_field <- function(i) vapply(parts, "[", character(1), i)

  intron_start <- as.numeric(txt_field(3))
  intron_end <- as.numeric(txt_field(5))
  strand <- txt_field(7)
  # %in% never yields NA, so ids without a strand behave like "+" ids
  on_minus <- strand %in% "-"

  data.frame(chr = txt_field(1),
             start = ifelse(on_minus, intron_start - 3, intron_end - 20),
             end = ifelse(on_minus, intron_start + 19, intron_end + 2),
             name = x, dot = ".", strand = strand)
}

# Build intron / flanking-exon BED tables for all introns and for the
# "introndef" and "exondef" classes, then write them to the kmer directory.
# The split (up/down region) tables are restricted to introns of >= 240 nt.
introndef_rows <- subset(combo.juncratio.data.parsed, IEratio_bin=="introndef")
exondef_rows <- subset(combo.juncratio.data.parsed, IEratio_bin=="exondef")

all.intron <- getintronbed(combo.juncratio.data.parsed$intron)
all.intron_rmSS <- getintronbed_rmSS(combo.juncratio.data.parsed$intron)
all.intron_rmSS.split <- getintronbed_rmSS_split(subset(combo.juncratio.data.parsed, intronlen >= 240)$intron)
all.upexon <- getupexonbed(combo.juncratio.data.parsed$intron)
all.downexon <- getdownexonbed(combo.juncratio.data.parsed$intron)

ID.intron <- getintronbed(introndef_rows$intron)
ID.intron_rmSS <- getintronbed_rmSS(introndef_rows$intron)
ID.intron_rmSS.split <- getintronbed_rmSS_split(subset(introndef_rows, intronlen >= 240)$intron)
ID.upexon <- getupexonbed(introndef_rows$intron)
ID.downexon <- getdownexonbed(introndef_rows$intron)

ED.intron <- getintronbed(exondef_rows$intron)
ED.intron_rmSS <- getintronbed_rmSS(exondef_rows$intron)
ED.intron_rmSS.split <- getintronbed_rmSS_split(subset(exondef_rows, intronlen >= 240)$intron)
ED.upexon <- getupexonbed(exondef_rows$intron)
ED.downexon <- getdownexonbed(exondef_rows$intron)

kmerdir <- "~/Dropbox (MIT)/Projects/Adelman/timecourse/definition/kmers/"

# write a table as a headerless, tab-separated file inside kmerdir
write_kmer_bed <- function(bed, filename){
  write.table(bed, file=paste0(kmerdir, filename), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
}
# rows of one region type ("up" or "down") from a split table, without the type column
split_region <- function(splitbed, which_type){
  splitbed[which(splitbed$type == which_type), -1]
}

write_kmer_bed(all.intron, "all_intron.bed")
write_kmer_bed(all.intron_rmSS, "all_intron_rmSS.bed")
write_kmer_bed(split_region(all.intron_rmSS.split, "up"), "all_intron_rmSS_upregion.bed")
write_kmer_bed(split_region(all.intron_rmSS.split, "down"), "all_intron_rmSS_downregion.bed")
write_kmer_bed(all.upexon, "all_upexon.bed")
write_kmer_bed(all.downexon, "all_downexon.bed")

write_kmer_bed(ID.intron, "ID_intron.bed")
write_kmer_bed(ID.intron_rmSS, "ID_intron_rmSS.bed")
write_kmer_bed(split_region(ID.intron_rmSS.split, "up"), "ID_intron_rmSS_upregion.bed")
write_kmer_bed(split_region(ID.intron_rmSS.split, "down"), "ID_intron_rmSS_downregion.bed")
write_kmer_bed(ID.upexon, "ID_upexon.bed")
write_kmer_bed(ID.downexon, "ID_downexon.bed")

write_kmer_bed(ED.intron, "ED_intron.bed")
write_kmer_bed(ED.intron_rmSS, "ED_intron_rmSS.bed")
write_kmer_bed(split_region(ED.intron_rmSS.split, "up"), "ED_intron_rmSS_upregion.bed")
write_kmer_bed(split_region(ED.intron_rmSS.split, "down"), "ED_intron_rmSS_downregion.bed")
write_kmer_bed(ED.upexon, "ED_upexon.bed")
write_kmer_bed(ED.downexon, "ED_downexon.bed")

# get splice site pictograms
#
# Writes 5' and 3' splice-site windows (get5ssbed / get3ssbed) for all introns
# and for each IEratio_bin class into the ss_pict/ folder under kmerdir.

# write a table as a headerless, tab-separated file inside kmerdir/ss_pict/
write_ss_bed <- function(bed, filename){
  write.table(bed, file=paste0(kmerdir,"ss_pict/",filename),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t")
}
# random subset of at most n rows; capping n avoids the error sample() raises
# when asked for more rows than the table has
subsample_rows <- function(bed, n){
  bed[sample.int(nrow(bed), min(n, nrow(bed))),]
}

all.5ss <- get5ssbed(combo.juncratio.data.parsed$intron)
all.3ss <- get3ssbed(combo.juncratio.data.parsed$intron)
write_ss_bed(all.5ss, "all_5ss.bed")
write_ss_bed(all.3ss, "all_3ss.bed")

introndef_introns <- subset(combo.juncratio.data.parsed, IEratio_bin=="introndef")$intron
ID.5ss <- get5ssbed(introndef_introns)
ID.3ss <- get3ssbed(introndef_introns)
write_ss_bed(ID.5ss, "ID_5ss.bed")
write_ss_bed(ID.3ss, "ID_3ss.bed")
# NOTE(review): the 5'ss and 3'ss subsamples are drawn independently, so they
# do not come from the same introns - confirm this is intended
write_ss_bed(subsample_rows(ID.5ss, 5000), "IDsub_5ss.bed")
write_ss_bed(subsample_rows(ID.3ss, 5000), "IDsub_3ss.bed")

confused_introns <- subset(combo.juncratio.data.parsed, IEratio_bin=="confused")$intron
confused.5ss <- get5ssbed(confused_introns)
confused.3ss <- get3ssbed(confused_introns)
write_ss_bed(confused.5ss, "confused_5ss.bed")
write_ss_bed(confused.3ss, "confused_3ss.bed")

exondef_introns <- subset(combo.juncratio.data.parsed, IEratio_bin=="exondef")$intron
ED.5ss <- get5ssbed(exondef_introns)
ED.3ss <- get3ssbed(exondef_introns)
write_ss_bed(ED.5ss, "ED_5ss.bed")
write_ss_bed(ED.3ss, "ED_3ss.bed")

# read in kmer enrichment results
#
# Every *.enrich table is given the same seven column names. The tables are
# then stacked into all.enrich (full introns) and all.enrich_rmSS (introns with
# splice-site regions removed), each labelled by definition class and region.

enrich_dir <- "~/Dropbox (MIT)/Projects/Adelman/timecourse/definition/kmers/"
enrich_cols <- c("kmer","fore","back","enrichment","pval","BH_corrected_pval","best_protein_matches")

# read <enrich_dir><prefix>.enrich and apply the shared column names
read_enrich <- function(prefix){
  tab <- read.table(paste0(enrich_dir, prefix, ".enrich"), header=TRUE, sep="\t")
  names(tab) <- enrich_cols
  tab
}

# Three exon-defined tables are used below but were never read by this script.
# Keep a copy that is already in the workspace; otherwise read it from the
# file that follows the naming of the other tables.
# NOTE(review): ED_downexon.enrich is named by analogy with ID_downexon.enrich -
# confirm the file exists under that name.
enrich_from_workspace_or_file <- function(objname, prefix){
  if(exists(objname)){
    tab <- get(objname)
    names(tab) <- enrich_cols
    return(tab)
  }
  read_enrich(prefix)
}

ID.intron.enrich <- read_enrich("ID_intron")
ID.intron_rmSS.enrich <- read_enrich("ID_intron_rmSS")
ID.intron_rmSS_upregion.enrich <- read_enrich("ID_intron_rmSS_upregion")
ID.intron_rmSS_downregion.enrich <- read_enrich("ID_intron_rmSS_downregion")
ID.upexon.enrich <- read_enrich("ID_upexon")
ID.downexon.enrich <- read_enrich("ID_downexon")

ED.intron.enrich <- enrich_from_workspace_or_file("ED.intron.enrich", "ED_intron")
ED.intron_rmSS.enrich <- enrich_from_workspace_or_file("ED.intron_rmSS.enrich", "ED_intron_rmSS")
ED.intron_rmSS_upregion.enrich <- read_enrich("ED_intron_rmSS_upregion")
ED.intron_rmSS_downregion.enrich <- read_enrich("ED_intron_rmSS_downregion")
ED.upexon.enrich <- read_enrich("ED_upexon")
ED.downexon.enrich <- enrich_from_workspace_or_file("ED.downexon.enrich", "ED_downexon")

# Stack tables and label every row with its definition class and region.
# Labels are repeated by the actual row count of each table rather than an
# assumed 4096 rows per table.
stack_enrich <- function(tables, defs, regions){
  n_rows <- vapply(tables, nrow, integer(1))
  data.frame(def = rep(defs, times=n_rows),
             region = rep(regions, times=n_rows),
             do.call(rbind, tables))
}
enrich_defs <- rep(c("introndef", "exondef"), each=3)
enrich_regions <- rep(c("upexon","intron","downexon"), 2)

all.enrich <- stack_enrich(list(ID.upexon.enrich, ID.intron.enrich, ID.downexon.enrich,
                                ED.upexon.enrich, ED.intron.enrich, ED.downexon.enrich),
                           enrich_defs, enrich_regions)

all.enrich_rmSS <- stack_enrich(list(ID.upexon.enrich, ID.intron_rmSS.enrich, ID.downexon.enrich,
                                     ED.upexon.enrich, ED.intron_rmSS.enrich, ED.downexon.enrich),
                                enrich_defs, enrich_regions)

# alternative layout that also includes the intron up/down regions (kept for reference)
#all.enrich_rmSS <- data.frame(def = rep(c(rep("introndef", 4096*5), rep("exondef", 4096*4))),
#                         region = c(rep(c("upexon","intron_up","intron","intron_down","downexon"), each=4096),
#                                    rep(c("upexon","intron_up","intron_down","downexon"), each=4096)),
#                         rbind(ID.upexon.enrich, ID.intron_rmSS_upregion.enrich, ID.intron_rmSS.enrich, ID.intron_rmSS_downregion.enrich, ID.downexon.enrich,
#                               ED.upexon.enrich, ED.intron_rmSS_upregion.enrich, ED.intron_rmSS_downregion.enrich, ED.downexon.enrich))
#all.enrich_rmSS$def <- factor(all.enrich_rmSS$def, levels=c("introndef","exondef"))
#all.enrich_rmSS$region <- factor(all.enrich_rmSS$region, levels=c("upexon","intron_up","intron","intron_down","downexon"))

# read SRE
ESE <- read.table("~/Dropbox (MIT)/Annotations/SREs/ESE_hexamers_U.txt",header=FALSE)
ESS <- read.table("~/Dropbox (MIT)/Annotations/SREs/ESS_hexamers_U.txt",header=FALSE)
ISE <- read.table("~/Dropbox (MIT)/Annotations/SREs/ISE_hexamers_U.txt",header=FALSE)
ISS <- read.table("~/Dropbox (MIT)/Annotations/SREs/ISS_hexamers_U.txt",header=FALSE)

# hexamers labelled as splice-site motifs when found in intronic rows
ss3_6mers <- c("CCCCAG","CCUCAG","CUCCAG","CUUCAG","UCCCAG","UCUCAG","UUCCAG","UUUCAG")
ss5_6mers <- c("GUAAGU","GUGAGU")

# Add an SRE label column to an enrichment table and turn `def` and `region`
# into ordered-for-plotting factors.
#
# enrich: data.frame with columns kmer, def and region, where region takes the
#         values "upexon", "intron" and "downexon".
#
# Intronic rows are matched against the ISS/ISE lists and the splice-site
# hexamers; all other rows against the ESS/ESE lists. Later assignments
# overwrite earlier ones, so a kmer on two lists keeps the later label.
# Returns the annotated table.
annotate_SRE <- function(enrich){
  # the region label for intronic rows is "intron" in both enrichment tables;
  # comparing against any other label would leave intronic rows unannotated
  # and convert them to NA when the factor levels are set
  intronic <- enrich$region == "intron"
  enrich$SRE <- NA
  enrich$SRE[which(intronic & enrich$kmer %in% ISS$V1)] <- "ISS"
  enrich$SRE[which(intronic & enrich$kmer %in% ISE$V1)] <- "ISE"
  enrich$SRE[which(!intronic & enrich$kmer %in% ESS$V1)] <- "ESS"
  enrich$SRE[which(!intronic & enrich$kmer %in% ESE$V1)] <- "ESE"
  enrich$SRE[which(intronic & enrich$kmer %in% ss3_6mers)] <- "3'ss"
  enrich$SRE[which(intronic & enrich$kmer %in% ss5_6mers)] <- "5'ss"
  enrich$def <- factor(enrich$def, levels=c("introndef","exondef"))
  enrich$region <- factor(enrich$region, levels=c("upexon","intron","downexon"))
  enrich
}

# both tables are annotated: the enrichment plots colour labels by SRE for
# all.enrich as well as for all.enrich_rmSS
all.enrich <- annotate_SRE(all.enrich)
all.enrich_rmSS <- annotate_SRE(all.enrich_rmSS)


# Volcano-style plot of kmer enrichment: log2 enrichment against -log10 p-value,
# one panel per definition class (rows) and region (columns). Kmers with
# BH_corrected_pval < 10^-30 and |log2 enrichment| > 0.5 are labelled using
# the aesthetics given in `label_aes`.
enrich_volcano <- function(enrich, label_aes){
  strong <- subset(enrich, BH_corrected_pval<10^-30 & abs(log2(enrichment)) > 0.5)
  ggplot(enrich, aes(x=log2(enrichment), y=-log10(pval))) + geom_point(alpha=0.5) +
    geom_text_repel(data=strong, mapping=label_aes, size=3) +
    facet_grid(def~region)
}

# full introns; labels coloured by SRE class (needs an SRE column on all.enrich)
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/definition/kmers/enrichment.pdf",width=15,height=10,useDingbats = FALSE)
enrich_volcano(all.enrich, aes(label=kmer, color=factor(SRE)))
dev.off()

# introns with splice-site regions removed; labels coloured by SRE class
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/definition/kmers/enrichment_rmSS.pdf",width=15,height=10,useDingbats = FALSE)
enrich_volcano(all.enrich_rmSS, aes(label=kmer, color=factor(SRE)))
dev.off()

# same table as above, labels without colour
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/definition/kmers/enrichment_rmSS_intronregions.pdf",width=15,height=10,useDingbats = FALSE)
enrich_volcano(all.enrich_rmSS, aes(label=kmer))
dev.off()

# Boxplots of 3' and 5' splice site scores per definition class, filled by
# intron-length bin; both plots go into the same PDF and share one fill scale.
lenbin_fill_scale <- scale_fill_manual(values=c(brewer.pal(9,"PuRd")[5:9], brewer.pal(9,"Blues")[5:9]),
                                       labels=c("43-59 nt","60-63 nt","64-68 nt","69-99 nt","99-2,493 nt","57-353 nt","354-674 nt","675-1,252 nt","1,253-2,943 nt","> 2,944 nt"),
                                       guide=guide_legend(ncol=5,byrow=TRUE))
legend_at_bottom <- theme(legend.position="bottom")

pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/definition/kmers/splicesites_bydef.pdf")
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=factor(IEratio_bin),y=ss3,fill=factor(len_bin_intron_def))) +
  geom_boxplot(notch=TRUE) +
  lenbin_fill_scale +
  labs(x="definition",y="3'splice site score (maxEnt)",fill="quantiles") +
  legend_at_bottom
ggplot(combo.juncratio.data.parsed.bothdef, aes(x=factor(IEratio_bin),y=ss5,fill=factor(len_bin_intron_def))) +
  geom_boxplot(notch=TRUE) +
  lenbin_fill_scale +
  labs(x="definition",y="5'splice site score (maxEnt)",fill="quantiles") +
  legend_at_bottom
dev.off()

```

# Splice Sites
```{r}

# Long-format table with one row per intron and splice site type: every
# per-intron column is repeated twice (5SS rows first, then 3SS rows) and SS
# holds the ss5 scores followed by the ss3 scores.
miso.splicesite.data <- local({
  parsed <- combo.juncratio.data.parsed
  twice <- function(v) rep(v, 2)
  data.frame(type = rep(c("5SS","3SS"), each=nrow(parsed)),
             halflife = twice(parsed$fitvalue),
             len_bin = twice(parsed$len_bin),
             len.bin5 = twice(parsed$lenbin5),
             len = twice(parsed$intronlen),
             def = twice(parsed$IEratio_bin),
             SS = c(parsed$ss5, parsed$ss3))
})

# splice site score by length bin, overall and then split by definition class
ss_by_lenbin <- ggplot(miso.splicesite.data, aes(x=factor(len.bin5),y=SS,fill=factor(type))) + geom_boxplot(notch=TRUE)
ss_by_lenbin

ss_by_lenbin + facet_wrap(~def)


##### does splice site strength of small introns explain slower splicing?
#
# For every intron of <= 50 nt, draw one 60-70 nt intron at random whose
# splice site score lies within 0.5 of the small intron's score (3'ss only,
# 5'ss only, or both), then compare the half-lives of the matched sets.

indssmall <- which(combo.juncratio.data.parsed$intronlen <= 50)
# 60-70 nt introns (the upper bound must be <= 70 to match the plot labels)
indsmid <- which(combo.juncratio.data.parsed$intronlen > 60 & combo.juncratio.data.parsed$intronlen <= 70)

# Draw one element from `candidates`, or NA when there is none.
# Indexing through sample.int() avoids sample()'s special case in which a
# single number n is treated as the range 1:n.
pick_match <- function(candidates){
  if(length(candidates) == 0) return(NA_integer_)
  candidates[sample.int(length(candidates), 1)]
}

mid_ss3 <- combo.juncratio.data.parsed$ss3[indsmid]
mid_ss5 <- combo.juncratio.data.parsed$ss5[indsmid]

ss3.match <- ss5.match <- both.match <- rep(NA_integer_, length(indssmall))
for(k in seq_along(indssmall)){
  near_ss3 <- abs(mid_ss3 - combo.juncratio.data.parsed$ss3[indssmall[k]]) < 0.5
  near_ss5 <- abs(mid_ss5 - combo.juncratio.data.parsed$ss5[indssmall[k]]) < 0.5
  ss3.match[k] <- pick_match(indsmid[which(near_ss3)])
  ss5.match[k] <- pick_match(indsmid[which(near_ss5)])
  both.match[k] <- pick_match(indsmid[which(near_ss3 & near_ss5)])
}
# small introns without any candidate contribute no matched intron
ss3.match <- ss3.match[!is.na(ss3.match)]
ss5.match <- ss5.match[!is.na(ss5.match)]
both.match <- both.match[!is.na(both.match)]

match_labels <- c("40-50 nt","60-70 nt\n(matched for 3'ss score)","60-70 nt\n(matched for 5'ss score)","60-70 nt\n(matched for both ss scores)")
# values of one column for the small introns followed by the three matched sets
by_group <- function(values){
  c(values[indssmall], values[ss3.match], values[ss5.match], values[both.match])
}

ss.match.data <- data.frame(type = rep(match_labels, times=c(length(indssmall), length(ss3.match), length(ss5.match), length(both.match))),
                            half.lives = by_group(combo.juncratio.data.parsed$fitvalue),
                            ss3 = by_group(combo.juncratio.data.parsed$ss3),
                            ss5 = by_group(combo.juncratio.data.parsed$ss5))
ss.match.data$type <- factor(ss.match.data$type, levels=match_labels)

ggplot(ss.match.data, aes(x=factor(type),y=half.lives,fill=factor(type))) + geom_boxplot(notch=TRUE,outlier.color="white") + scale_y_log10(limits=c(0.25,100),breaks=c(10,100))

```

Find branchpoint in variable length introns
```{r}

# BED rows for a search window placed a fixed distance from one end of each intron.
#
# datahere: data.frame with an `intron` column of ":"-separated identifiers
#           (field 1 chromosome, fields 3 and 5 intron start/end, field 7
#           strand) and an `intronlen` column.
# upstream: distance between the intron end and the near edge of the window.
# window:   width of the window.
#
# Plus strand:  [end - upstream - window, end - upstream]
# Minus strand: [start + upstream, start + upstream + window]
# Returns a data.frame with columns chr, start, end, name, dot, strand, where
# name is the intron id followed by ";intron-" and the intron length.
getbpseq <- function(datahere, upstream, window){
  parts <- strsplit(as.character(datahere$intron), split=":")
  pick <- function(i) unlist(lapply(parts, "[", i))
  intron_start <- as.numeric(pick(3))
  intron_end <- as.numeric(pick(5))
  strand <- pick(7)
  minus <- which(strand == "-")

  win_start <- intron_end - upstream - window
  win_end <- intron_end - upstream
  win_start[minus] <- intron_start[minus] + upstream
  win_end[minus] <- intron_start[minus] + upstream + window

  data.frame(chr = pick(1),
             start = win_start,
             end = win_end,
             name = paste0(datahere$intron, ";intron-", datahere$intronlen),
             dot = ".",
             strand = strand)
}

# get data
#
# Introns grouped into 10-nt length classes. The 50-60, 60-70 and 70-80 nt
# classes are limited to their first 1000 introns, and every class also gets a
# random subsample of (at most) as many introns as the <= 50 nt class.

# random subset of at most n rows (capped so small classes do not raise an error)
sample_rows <- function(d, n){
  d[sample.int(nrow(d), min(n, nrow(d))),]
}

data50 <- subset(combo.juncratio.data.parsed, intronlen <= 50)
# head() keeps at most 1000 rows without padding short tables with NA rows
data60 <- head(subset(combo.juncratio.data.parsed, intronlen > 50 & intronlen <= 60), 1000)
data60.sample <- sample_rows(data60, nrow(data50))
data70 <- head(subset(combo.juncratio.data.parsed, intronlen > 60 & intronlen <= 70), 1000)
data70.sample <- sample_rows(data70, nrow(data50))
data80 <- head(subset(combo.juncratio.data.parsed, intronlen > 70 & intronlen <= 80), 1000)
data80.sample <- sample_rows(data80, nrow(data50))
data90 <- subset(combo.juncratio.data.parsed, intronlen > 80 & intronlen <= 90)
data90.sample <- sample_rows(data90, nrow(data50))
data100 <- subset(combo.juncratio.data.parsed, intronlen > 90 & intronlen <= 100)
data100.sample <- sample_rows(data100, nrow(data50))

# list names become the label part of the output file names
bp_sets <- list("50nt"=data50,
                "60nt"=data60, "60sample"=data60.sample,
                "70nt"=data70, "70sample"=data70.sample,
                "80nt"=data80, "80sample"=data80.sample,
                "90nt"=data90, "90sample"=data90.sample,
                "100nt"=data100, "100sample"=data100.sample)

# one search-window file per set, upstream offset and window width
for(upstream in seq(0,30,5)){
  for(window in c(10, 15)){
    print(paste(upstream,window,sep=" - "))
    for(label in names(bp_sets)){
      write.table(getbpseq(bp_sets[[label]], upstream, window),
                  file=paste0("getBP/bpsearch_",label,"_upstream",upstream,"_window",window,".txt"),
                  sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
    }
  }
}

# write out whole sequence
#
# One row per intron with its parsed coordinates (up = field 3, down = field 5
# of the intron id) plus a 50-nt window: [down - 50, down] on the plus strand
# and [up, up + 50] on the minus strand.
all.introns <- local({
  ids <- combo.juncratio.data.parsed$intron
  parts <- strsplit(as.character(ids), split=":")
  pick <- function(i) unlist(lapply(parts, "[", i))
  tab <- data.frame(chr = pick(1),
                    up = as.numeric(pick(3)),
                    down = as.numeric(pick(5)),
                    name = ids,
                    dot = ".",
                    strand = pick(7))
  minus <- which(tab$strand == "-")
  tab$start <- tab$down - 50
  tab$end <- tab$down
  tab$start[minus] <- tab$up[minus]
  tab$end[minus] <- tab$up[minus] + 50
  tab
})

# full table (all eight columns), then a six-column BED of the 50-nt windows
write.table(all.introns, file="getBP/bpsearch_allintrons.bed",sep="\t",quote=FALSE,row.names=FALSE,col.names=FALSE)
write.table(all.introns[,c(1,7,8,4:6)], file="getBP/bpsearch_allintrons50.bed",sep="\t",quote=FALSE,row.names=FALSE,col.names=FALSE)

# motif scan results for the 50-nt windows (the scan over whole introns is kept for reference)
#bpscan <- read.table("~/Dropbox (MIT)/scripts/scanMotifs/bpsearch_allintrons.scan",header=T)
bpscan <- read.table("~/Dropbox (MIT)/scripts/scanMotifs/bpsearch_allintrons50.scan",header=TRUE)
# get distance from 3'ss
bpscan$ss3_dist <- bpscan$seq_len - 5 - bpscan$max_PSSM_pos
# get intron length (field 5 minus field 3 of the ":"-separated gene id)
bpscan$intronlen <- local({
  parts <- strsplit(as.character(bpscan$gene),split=":")
  coord <- function(i) as.numeric(unlist(lapply(parts, "[", i)))
  coord(5) - coord(3)
})

# bin into 10bp bins: (-Inf,50], (50,60], ..., (140,150]; longer introns stay NA
bpscan$bin_10bp <- cut(bpscan$intronlen,
                       breaks=c(-Inf, seq(50, 150, 10)),
                       labels=c("40-50nt", "50-60nt", "60-70nt", "70-80nt", "80-90nt", "90-100nt",
                                "100-110nt", "110-120nt", "120-130nt", "130-140nt", "140-150nt"))

# introns of at most 150 nt, i.e. those that fall into one of the bins
bpscan_upto150 <- subset(bpscan, intronlen<=150)

# position of the best motif match against intron length
ggplot(bpscan, aes(x=intronlen, y=max_PSSM_pos)) + geom_point() + scale_x_log10()
ggplot(subset(bpscan, intronlen<=100), aes(x=factor(bin_10bp), y=max_PSSM_pos)) + geom_boxplot()
# distance of the best match from the 3' splice site, per length bin
ggplot(bpscan_upto150, aes(x=factor(bin_10bp), y=ss3_dist)) + geom_boxplot(notch=TRUE)
ggplot(bpscan_upto150, aes(x=ss3_dist,fill=factor(bin_10bp))) + geom_density(alpha=0.5)
ggplot(bpscan_upto150, aes(x=ss3_dist,color=factor(bin_10bp))) + geom_density(alpha=0.5) + xlim(0,50)
ggplot(bpscan, aes(x=ss3_dist,color=factor(bin_10bp))) + geom_density(alpha=0.5) + xlim(0,50)

# score of the best match, per length bin
ggplot(bpscan_upto150, aes(x=factor(bin_10bp),y=max_PSSM_score)) + geom_boxplot()

# Mean and standard error of the distance between the best motif match and the
# 3' splice site for the six shortest intron-length bins.
bp_bar_bins <- c("40-50nt","50-60nt","60-70nt","70-80nt","80-90nt","90-100nt")

# non-missing distances per bin; dropping NA here keeps the standard error's
# denominator equal to the number of values that enter sd()
bp_bar_dist <- lapply(bp_bar_bins, function(b){
  d <- bpscan$ss3_dist[which(bpscan$bin_10bp == b)]
  d[!is.na(d)]
})

bpscan_bar <- data.frame(bin = bp_bar_bins,
                         mean = vapply(bp_bar_dist, mean, numeric(1)),
                         se = vapply(bp_bar_dist, function(d) sd(d)/sqrt(length(d)), numeric(1)))
# reversed levels so that the shortest bin ends up on top after coord_flip()
bpscan_bar$bin <- factor(bpscan_bar$bin, levels=rev(bp_bar_bins))

# bars extend in the negative direction; distances are plotted as -mean
ggplot(bpscan_bar, aes(x=factor(bin),y=-mean)) + geom_bar(stat="identity") + geom_errorbar(aes(ymin=-mean-se, ymax=-mean+se),width=0.5) + ylim(-40,0) + coord_flip() + 
  labs(y="distance from 3' splice site",x="intron length")

```

Write out fastas of sequences for pictograms
```{r}

# Write the scanned sequences of a set of bpscan rows as FASTA records:
# a ">" header line carrying the gene id, followed by the sequence.
write_bp_fasta <- function(rows, outfile){
  write.table(paste0(">",rows$gene,"\n",rows$seq), file=outfile, sep="\t",quote=FALSE,row.names=FALSE,col.names=FALSE)
}

# one file per 10-nt length bin
write_bp_fasta(subset(bpscan, bin_10bp=="40-50nt"), "getBP/bpfasta.50.fa")
write_bp_fasta(subset(bpscan, bin_10bp=="50-60nt"), "getBP/bpfasta.60.fa")
write_bp_fasta(subset(bpscan, bin_10bp=="60-70nt"), "getBP/bpfasta.70.fa")
write_bp_fasta(subset(bpscan, bin_10bp=="70-80nt"), "getBP/bpfasta.80.fa")
write_bp_fasta(subset(bpscan, bin_10bp=="80-90nt"), "getBP/bpfasta.90.fa")
write_bp_fasta(subset(bpscan, bin_10bp=="90-100nt"), "getBP/bpfasta.100.fa")
# NOTE(review): this file holds only the rows without a length bin, although
# it is named "all" - confirm that is intended
write_bp_fasta(subset(bpscan, is.na(bin_10bp)), "getBP/bpfasta.all.fa")

```

# Same Gene
```{r}

# introns whose type is "CI" (rows with a missing type are dropped)
combo.juncratio.data.parsed.CI <- combo.juncratio.data.parsed[which(combo.juncratio.data.parsed$type=="CI"), ]

# get median half-lives + standard deviations across introns from the same gene
# distinct gene ids, in order of first appearance
genes <- unique(as.character(combo.juncratio.data.parsed$gene))
# starts out empty
combo.metadata.polyA.half.median <- NULL
for(i in 1:length(genes)){
  print(i)
  hold <- which(combo.juncratio.data.parsed$gene == as.character(genes[i]))
  # initialize vectors
  per_RINE = NA; per_optimal = NA
  numexp.introns = 0; rate_FI = NA; len_FI = NA; len_all = NA; len_sd = NA; RIME_all = NA; RIME_sd = NA
  rate_Other = NA; std_Other = NA; rate_all = NA; std_all = NA
  # if no matches - everything NA
  # only 1 matching intron
  if(length(hold) == 1){
    per_RINE = sum(combo.juncratio.data.parsed$IEratio[hold] >= 1)/length(hold)
    per_optimal = sum(combo.juncratio.data.parsed$intronlen[hold] >= 60 & combo.juncratio.data.parsed$intronlen[hold] <= 70)/length(hold)
    numexp.introns = 1; 
    rate_all<- combo.juncratio.data.parsed$fitvalue[hold]; 
    std_all=NA; 
    std_Other=NA;
    len_all = combo.juncratio.data.parsed$intronlen[hold]
    len_sd = NA
    RIME_all = combo.juncratio.data.parsed$IEratio[hold]
    RIME_sd = NA
    if(!is.na(combo.juncratio.data.parsed$intronnum[hold]) & combo.juncratio.data.parsed$intronnum[hold] == 0){ 
      rate_Other=NA
      rate_FI = combo.juncratio.data.parsed$fitvalue[hold]; 
      len_FI = combo.juncratio.data.parsed$intronlen[hold] }
    if(is.na(combo.juncratio.data.parsed$intronnum[hold]) | combo.juncratio.data.parsed$intronnum[hold] !=0){ 
      rate_Other <- combo.juncratio.data.parsed$fitvalue[hold]
      rate_FI = NA; 
      len_FI = NA }
  }
  # multiple matching introns
  if(length(hold) > 1){
    per_RINE = sum(combo.juncratio.data.parsed$IEratio[hold] >= 1)/length(hold)
    per_optimal = sum(combo.juncratio.data.parsed$intronlen[hold] >= 60 & combo.juncratio.data.parsed$intronlen[hold] <= 70)/length(hold)
    numexp.introns = length(hold); 
    rate_all = median(combo.juncratio.data.parsed$fitvalue[hold], na.rm=T); 
    std_all = sd(combo.juncratio.data.parsed$fitvalue[hold], na.rm=T)
    len_all = median(combo.juncratio.data.parsed$intronlen[hold],na.rm=T)
    len_sd = sd(combo.juncratio.data.parsed$intronlen[hold],na.rm=T)
    RIME_all = median(combo.juncratio.data.parsed$IEratio[hold], na.rm=T)
    RIME_sd = sd(combo.juncratio.data.parsed$IEratio[hold], na.rm=T)
    first.hold <- which(combo.juncratio.data.parsed$intronnum[hold] == 0)
    if(length(first.hold) == 0){ 
      rate_FI = NA; 
      len_FI = NA 
      rate_Other = median(combo.juncratio.data.parsed$fitvalue[hold], na.rm=T); 
      std_Other = sd(combo.juncratio.data.parsed$fitvalue[hold], na.rm=T)
    }
    if(length(first.hold) == 1){
      rate_FI = combo.juncratio.data.parsed$fitvalue[hold[first.hold]]; 
      len_FI = combo.juncratio.data.parsed$intronlen[hold[first.hold]]
      rate_Other = median(combo.juncratio.data.parsed$fitvalue[hold[-first.hold]], na.rm=T)
      std_Other = sd(combo.juncratio.data.parsed$fitvalue[hold[-first.hold]], na.rm=T)
    }
    if(length(first.hold) > 1){
      rate_FI = median(combo.juncratio.data.parsed$fitvalue[hold[first.hold]], na.rm=T); 
      len_FI = median(combo.juncratio.data.parsed$intronlen[hold[first.hold]], na.rm=T)
      rate_Other = median(combo.juncratio.data.parsed$fitvalue[hold[-first.hold]], na.rm=T)
      std_Other = sd(combo.juncratio.data.parsed$fitvalue[hold[-first.hold]], na.rm=T)
    }
  }
  ## CI matching
  hold_CI <- which(combo.juncratio.data.parsed.CI$gene == as.character(genes[i]))
  #if no CI matches
  if(length(hold_CI) == 0){
   numexp.introns.CI = 0; rate_FI_CI = NA; ss3_FI_CI = NA; ss5_FI_CI = NA; len_FI_CI = NA
   rate_Other_CI = NA; std_Other_CI = NA; rate_all_CI = NA; std_all_CI = NA
  }
  #only 1 matching intron
  if(length(hold_CI) == 1){
   numexp.introns.CI = 1; 
   rate_all_CI <- combo.juncratio.data.parsed.CI$fitvalue[hold_CI]; 
   std_all_CI=NA; 
   std_Other_CI=NA
   if(!is.na(combo.juncratio.data.parsed.CI$intronnum[hold_CI]) & combo.juncratio.data.parsed.CI$intronnum[hold_CI] == 0){
     rate_FI_CI = combo.juncratio.data.parsed$fitvalue[hold_CI]; 
     len_FI_CI = combo.juncratio.data.parsed.CI$intronlen[hold_CI]
     ss3_FI_CI = combo.juncratio.data.parsed.CI$ss3[hold_CI]; 
     ss5_FI_CI = combo.juncratio.data.parsed.CI$ss5[hold_CI]
     rate_Other_CI = NA
   }
   if(is.na(combo.juncratio.data.parsed.CI$intronnum[hold_CI]) | combo.juncratio.data.parsed.CI$intronnum[hold_CI] !=0){ 
     rate_FI_CI = NA; 
     len_FI_CI = NA;
     ss3_FI_CI = NA
     ss5_FI_CI = NA
     rate_Other_CI = combo.juncratio.data.parsed.CI$fitvalue[hold_CI]
     }
  }
  # multiple matching introns
  if(length(hold_CI) > 1){
    numexp.introns.CI = length(hold_CI); 
    rate_all_CI = median(combo.juncratio.data.parsed.CI$fitvalue[hold_CI], na.rm=T); 
    std_all_CI = sd(combo.juncratio.data.parsed.CI$fitvalue[hold_CI], na.rm=T)
    first.hold <- which(combo.juncratio.data.parsed.CI$intronnum[hold_CI] == 0)
    if(length(first.hold) == 0){
      rate_FI_CI = NA; len_FI_CI = NA; ss3_FI_CI = NA; ss5_FI_CI = NA
      rate_Other_CI = median(combo.juncratio.data.parsed.CI$fitvalue[hold_CI], na.rm=T); std_Other_CI = sd(combo.juncratio.data.parsed.CI$fitvalue[hold_CI], na.rm=T)
    }
    if(length(first.hold) == 1){
      rate_FI_CI = combo.juncratio.data.parsed.CI$fitvalue[hold_CI[first.hold]]; 
      len_FI_CI = combo.juncratio.data.parsed.CI$intronlen[hold_CI[first.hold]]
      ss3_FI_CI = combo.juncratio.data.parsed.CI$ss3[hold_CI[first.hold]]; 
      ss5_FI_CI = combo.juncratio.data.parsed.CI$ss5[hold_CI[first.hold]]
      rate_Other_CI = median(combo.juncratio.data.parsed.CI$fitvalue[hold_CI[-first.hold]], na.rm=T)
      std_Other_CI = sd(combo.juncratio.data.parsed.CI$fitvalue[hold_CI[-first.hold]], na.rm=T)
    }
    if(length(first.hold) > 1){
      rate_FI_CI = median(combo.juncratio.data.parsed.CI$fitvalue[hold_CI[first.hold]], na.rm=T); 
      len_FI_CI = median(combo.juncratio.data.parsed.CI$intronlen[hold_CI[first.hold]], na.rm=T)
      ss3_FI_CI = median(combo.juncratio.data.parsed.CI$ss3[hold_CI[first.hold]], na.rm=T); 
      ss5_FI_CI = median(combo.juncratio.data.parsed.CI$ss5[hold_CI[first.hold]], na.rm=T)
      rate_Other_CI = median(combo.juncratio.data.parsed.CI$fitvalue[hold_CI[-first.hold]], na.rm=T)
      std_Other_CI = sd(combo.juncratio.data.parsed.CI$fitvalue[hold_CI[-first.hold]], na.rm=T)
    }
  }
  hold.row <- data.frame(Gene.ID = genes[i], Number.of.annotated.introns = NA, Number.of.detected.introns = numexp.introns, Number.of.detected.introns.CI = numexp.introns.CI,
                         Splicing.rate.first.intron = rate_FI, Splicing.rate.first.intron.CI = rate_FI_CI, Splice.5site.strength.first.intron.CI = ss3_FI_CI, Splice.3site.strength.first.intron.CI= ss5_FI_CI,
                         Length.first.intron = len_FI, Length.first.intron.CI = len_FI_CI,
                         Splicing.rate.remaining.introns = rate_Other, Std.Dev.remaining.introns = std_Other, Splicing.rate.all.introns = rate_all, Std.Dev.all.introns = std_all,
                         Splicing.rate.remaining.introns.CI = rate_Other_CI, Std.Dev.remaining.introns.CI = std_Other_CI, Splicing.rate.all.introns.CI = rate_all_CI, Std.Dev.all.introns.CI = std_all_CI,
                         percentage_exondef = per_RINE, percentage_optimal = per_optimal, Mean.length.all = len_all, SD.length.all = len_sd, RIME.all = RIME_all, RIME.sd = RIME_sd)
  combo.metadata.polyA.half.median <- rbind(combo.metadata.polyA.half.median, hold.row)
}

# Per-gene summary of sumsqfit.data.matrix, one row per entry of `genes`:
# number of detected introns, median/SD of fitvalue over all introns, over first
# introns (intronnum == 0) and over the remaining introns, and median/SD of intronlen.
# median() of a single value is that value and sd() of fewer than two values is NA,
# so genes with one intron do not need a separate branch.
# Rows are collected in a pre-sized list and bound once (rbind inside the loop is quadratic).
sumsqfit.gene.rows <- vector("list", length(genes))
for(i in seq_along(genes)){
  print(i)
  hold <- which(sumsqfit.data.matrix$gene == as.character(genes[i]))
  # defaults: everything NA when the gene has no matching intron
  numexp.introns <- length(hold); rate_FI <- NA; len_FI <- NA; len_all <- NA; len_sd <- NA
  rate_Other <- NA; std_Other <- NA; rate_all <- NA; std_all <- NA
  if(length(hold) > 0){
    fit.hold <- sumsqfit.data.matrix$fitvalue[hold]
    len.hold <- sumsqfit.data.matrix$intronlen[hold]
    rate_all <- median(fit.hold, na.rm=TRUE)
    std_all <- sd(fit.hold, na.rm=TRUE)
    len_all <- median(len.hold, na.rm=TRUE)
    len_sd <- sd(len.hold, na.rm=TRUE)
    # positions (within hold) of first introns; an NA intronnum counts as "not first"
    first.hold <- which(sumsqfit.data.matrix$intronnum[hold] == 0)
    if(length(first.hold) > 0){
      rate_FI <- median(fit.hold[first.hold], na.rm=TRUE)
      len_FI <- median(len.hold[first.hold], na.rm=TRUE)
      fit.rest <- fit.hold[-first.hold]
    } else {
      fit.rest <- fit.hold
    }
    rate_Other <- median(fit.rest, na.rm=TRUE)
    std_Other <- sd(fit.rest, na.rm=TRUE)
  }
  hold.row <- data.frame(Gene.ID = genes[i],
                         Number.of.annotated.introns = NA, Number.of.detected.introns = numexp.introns,
                         Splicing.rate.first.intron = rate_FI,
                         Length.first.intron = len_FI,
                         Splicing.rate.remaining.introns = rate_Other,
                         Std.Dev.remaining.introns = std_Other,
                         Splicing.rate.all.introns = rate_all,
                         Std.Dev.all.introns = std_all,
                         Mean.length.all = len_all, SD.length.all = len_sd)
  sumsqfit.gene.rows[[i]] <- hold.row
}
sumsqfit.metadata.polyA.half.median <- do.call(rbind, sumsqfit.gene.rows)

# Build randomly sampled "genes" that match the intron counts of the real genes.
#
# metadatahere: per-gene table; cols[1] is the column # of the number of detected
#               introns, cols[2] the column # of the first-intron splicing rate
#               (non-NA means the gene has a detected first intron).
# datahere:     per-intron table with columns fitvalue, intronlen, IEratio, intronnum.
#
# For each of 10 sampling sets and each gene with at least one detected intron,
# introns are drawn under three schemes:
#   Random      - from all introns
#   FirstAlways - one first intron (intronnum == 0) plus non-first introns (intronnum > 0)
#   ActualDist  - a first intron only if the real gene has one
# Within a set, drawn introns are removed from the pool so they are not reused.
# If a pool cannot supply a draw it is refilled with all of its candidates
# (NOTE(review): needed because every gene takes a first intron under FirstAlways;
# confirm that reusing introns after a refill is acceptable).
#
# Returns a data frame in long format with a `type` column ("all" / "remaining")
# and median ("splicing.rate", "mean") and SD ("stddev", "std") columns per scheme.
corr_halfs_all_median <- function(metadatahere, cols, datahere){
  # cols = c(column # of Number of detected introns, column # of Splicing rate first intron)
  setnum <- 10 # number of sampling sets
  inds <- which(metadatahere[,cols[1]] > 0)
  detected.introns <- metadatahere[inds,cols[1]]
  first.present <- !is.na(metadatahere[,cols[2]])[inds]
  ngenes <- length(detected.introns)
  every.intron <- seq_len(nrow(datahere))
  every.first <- which(datahere$intronnum == 0)
  every.remaining <- which(datahere$intronnum > 0)

  # Draw `size` row indices from `pool` without replacement and drop them from the pool.
  # Positions are sampled with sample.int() because sample(x, ...) treats a length-one
  # numeric x as 1:x, and the drawn VALUES are removed (pool[-values] would remove by position).
  draw <- function(pool, full, size){
    if(length(pool) < size){ pool <- full }
    picked <- pool[sample.int(length(pool), size)]
    list(picked = picked, pool = pool[!(pool %in% picked)])
  }

  set.list <- vector("list", setnum)
  for(i in seq_len(setnum)){
    print(paste0("sampling ",i))
    blank <- rep(NA_real_, ngenes)
    atrandom.med <- onefirst.all.med <- onefirst.remaining.med <- actualdist.all.med <- actualdist.remaining.med <- intronlen.all.med <- RIME.all.med <- blank
    atrandom.std <- onefirst.all.std <- onefirst.remaining.std <- actualdist.all.std <- actualdist.remaining.std <- intronlen.all.std <- RIME.all.std <- blank
    # pick introns at random
    print("...random")
    pool.all <- every.intron
    for(t in seq_len(ngenes)){
      got <- draw(pool.all, every.intron, detected.introns[t])
      hold <- got$picked
      pool.all <- got$pool
      atrandom.med[t] <- median(datahere$fitvalue[hold], na.rm=TRUE)
      atrandom.std[t] <- sd(datahere$fitvalue[hold], na.rm=TRUE)
      intronlen.all.med[t] <- median(datahere$intronlen[hold], na.rm=TRUE)
      intronlen.all.std[t] <- sd(datahere$intronlen[hold], na.rm=TRUE)
      RIME.all.med[t] <- median(datahere$IEratio[hold], na.rm=TRUE)
      RIME.all.std[t] <- sd(datahere$IEratio[hold], na.rm=TRUE)
    }
    # pick 1 first intron, and remaining non-first
    print("...always first")
    pool.first <- every.first
    pool.rest <- every.remaining
    for(t in seq_len(ngenes)){
      got <- draw(pool.first, every.first, 1)
      hold.first <- got$picked
      pool.first <- got$pool
      if(detected.introns[t] == 1){
        # only the first intron: SD and "remaining" values stay NA
        onefirst.all.med[t] <- datahere$fitvalue[hold.first]
      } else {
        got <- draw(pool.rest, every.remaining, detected.introns[t]-1)
        hold.rest <- got$picked
        pool.rest <- got$pool
        onefirst.all.med[t] <- median(datahere$fitvalue[c(hold.first, hold.rest)], na.rm=TRUE)
        onefirst.all.std[t] <- sd(datahere$fitvalue[c(hold.first, hold.rest)], na.rm=TRUE)
        onefirst.remaining.med[t] <- median(datahere$fitvalue[hold.rest], na.rm=TRUE)
        onefirst.remaining.std[t] <- sd(datahere$fitvalue[hold.rest], na.rm=TRUE)
      }
    }
    # pick as per actual distribution (if has first intron, yes, if not, then no)
    print("...actual distribution")
    pool.first <- every.first
    pool.rest <- every.remaining
    for(t in seq_len(ngenes)){
      if(first.present[t]){
        got <- draw(pool.first, every.first, 1)
        hold.first <- got$picked
        pool.first <- got$pool
        if(detected.introns[t] == 1){
          # only the first intron: SD and "remaining" values stay NA
          actualdist.all.med[t] <- datahere$fitvalue[hold.first]
        } else {
          got <- draw(pool.rest, every.remaining, detected.introns[t]-1)
          hold.rest <- got$picked
          pool.rest <- got$pool
          actualdist.all.med[t] <- median(datahere$fitvalue[c(hold.first, hold.rest)], na.rm=TRUE)
          actualdist.all.std[t] <- sd(datahere$fitvalue[c(hold.first, hold.rest)], na.rm=TRUE)
          actualdist.remaining.med[t] <- median(datahere$fitvalue[hold.rest], na.rm=TRUE)
          actualdist.remaining.std[t] <- sd(datahere$fitvalue[hold.rest], na.rm=TRUE)
        }
      } else {
        # no first intron in the real gene: draw non-first introns only
        got <- draw(pool.rest, every.remaining, detected.introns[t])
        hold.rest <- got$picked
        pool.rest <- got$pool
        actualdist.all.med[t] <- actualdist.remaining.med[t] <- median(datahere$fitvalue[hold.rest], na.rm=TRUE)
        actualdist.all.std[t] <- actualdist.remaining.std[t] <- sd(datahere$fitvalue[hold.rest], na.rm=TRUE)
      }
    }
    set.list[[i]] <- data.frame(set=rep(i,ngenes),
                                Random.splicing.rate = atrandom.med,
                                FirstAlways.splicing.rate.all = onefirst.all.med,
                                FirstAlways.splicing.rate.remaining = onefirst.remaining.med,
                                ActualDist.splicing.rate.all = actualdist.all.med,
                                ActualDist.splicing.rate.remaining = actualdist.remaining.med,
                                Random.stddev = atrandom.std,
                                FirstAlways.stddev.all = onefirst.all.std,
                                FirstAlways.stddev.remaining = onefirst.remaining.std,
                                ActualDist.stddev.all = actualdist.all.std,
                                ActualDist.stddev.remaining = actualdist.remaining.std,
                                IntronLength.mean = intronlen.all.med,
                                IntronLength.std = intronlen.all.std,
                                RIME.mean = RIME.all.med,
                                RIME.std = RIME.all.std)
  }
  sampling.data <- do.call(rbind, set.list)
  # long format: the "all" rows first, then the "remaining" rows
  # (columns without a "remaining" variant are NA there)
  nsampled <- nrow(sampling.data)
  sampling.data.miso <- data.frame(type = rep(c("all","remaining"),each=nsampled),
                                   set = rep(sampling.data$set, 2),
                                   Random.splicing.rate = c(sampling.data$Random.splicing.rate, rep(NA,nsampled)),
                                   FirstAlways.splicing.rate = c(sampling.data$FirstAlways.splicing.rate.all, sampling.data$FirstAlways.splicing.rate.remaining),
                                   ActualDist.splicing.rate = c(sampling.data$ActualDist.splicing.rate.all, sampling.data$ActualDist.splicing.rate.remaining),
                                   Random.stddev = c(sampling.data$Random.stddev, rep(NA,nsampled)),
                                   FirstAlways.stddev = c(sampling.data$FirstAlways.stddev.all, sampling.data$FirstAlways.stddev.remaining),
                                   ActualDist.stddev = c(sampling.data$ActualDist.stddev.all, sampling.data$ActualDist.stddev.remaining),
                                   IntronLength.mean = c(sampling.data$IntronLength.mean, rep(NA,nsampled)),
                                   IntronLength.std = c(sampling.data$IntronLength.std, rep(NA,nsampled)),
                                   RIME.mean = c(sampling.data$RIME.mean, rep(NA,nsampled)),
                                   # was filled with RIME.mean
                                   RIME.std = c(sampling.data$RIME.std, rep(NA,nsampled)))
  return(sampling.data.miso)
}

# across all introns: sample size-matched "genes" from the ordered intron table
# (column 3 = number of detected introns, column 5 = first-intron splicing rate)
sampling.data.combo.half.median <- corr_halfs_all_median(metadatahere = combo.metadata.polyA.half.median,
                                                         cols = c(3,5),
                                                         datahere = combo.juncratio.data.parsed.ordered)
# combined type/set label, ordered so that the ten "all" sets come before the ten "remaining" sets
sampling.data.combo.half.median$both <- factor(
  interaction(sampling.data.combo.half.median$type, sampling.data.combo.half.median$set),
  levels = c(paste0("all.", 1:10), paste0("remaining.", 1:10))
)

# sampling within length class

# Randomly sample size-matched "genes" from the introns in `datahere`
# (callers pass the introns of a single length class).
#
# metadatahere: per-gene table; cols[1] is the column # of the number of detected introns.
# cols:         c(column # of Number of detected introns, column # of Splicing rate first intron);
#               only cols[1] is used here, cols[2] is accepted for call compatibility.
# datahere:     per-intron table with columns fitvalue, intronlen and IEratio.
#
# For each gene with at least one detected intron, that many introns are drawn with
# replacement from the pool; the drawn introns are then removed from the pool.
# An emptied pool is refilled with all introns of `datahere`
# (NOTE(review): `datahere` is usually far smaller than the total number of requested
# introns, so refilling is unavoidable - confirm this matches the intended design).
# Returns one row per gene with the median ("rate"/"mean") and SD of fitvalue,
# intronlen and IEratio of the drawn introns; all NA when `datahere` has no rows.
length_sampling <- function(metadatahere, cols, datahere){
  setnum <- 1 # number of sampling sets
  inds <- which(metadatahere[,cols[1]] > 0)
  detected.introns <- metadatahere[inds,cols[1]]
  ngenes <- length(detected.introns)
  every.intron <- seq_len(nrow(datahere))
  set.list <- vector("list", setnum)
  for(i in seq_len(setnum)){
    print(paste0("sampling ",i))
    blank <- rep(NA_real_, ngenes)
    atrandom.med <- atrandom.std <- intronlen.all.med <- intronlen.all.std <- RIME.all.med <- RIME.all.std <- blank
    # pick introns at random
    print("...random")
    pool <- every.intron
    for(t in seq_len(ngenes)){
      if(length(every.intron) == 0){ break }
      if(length(pool) == 0){ pool <- every.intron }
      # sample positions: sample(x, ...) would treat a length-one numeric x as 1:x
      hold <- pool[sample.int(length(pool), detected.introns[t], replace=TRUE)]
      # remove the drawn VALUES (pool[-hold] would remove by position)
      pool <- pool[!(pool %in% hold)]
      atrandom.med[t] <- median(datahere$fitvalue[hold], na.rm=TRUE)
      atrandom.std[t] <- sd(datahere$fitvalue[hold], na.rm=TRUE)
      intronlen.all.med[t] <- median(datahere$intronlen[hold], na.rm=TRUE)
      intronlen.all.std[t] <- sd(datahere$intronlen[hold], na.rm=TRUE)
      RIME.all.med[t] <- median(datahere$IEratio[hold], na.rm=TRUE)
      RIME.all.std[t] <- sd(datahere$IEratio[hold], na.rm=TRUE)
    }
    set.list[[i]] <- data.frame(set=rep(i,ngenes),
                                Random.splicing.rate = atrandom.med,
                                Random.stddev = atrandom.std,
                                IntronLength.mean = intronlen.all.med,
                                IntronLength.std = intronlen.all.std,
                                RIME.mean = RIME.all.med,
                                RIME.std = RIME.all.std)
  }
  sampling.data <- do.call(rbind, set.list)
  return(sampling.data)
}

# Size-matched random sampling restricted to one intron length class per call.
# Classes are half-open on intronlen: [60,80), [80,200), [200,500), [500,1000),
# [1000,10000) and >= 10000.
sampling.data.combo.60.80nt <- length_sampling(
  metadatahere = combo.metadata.polyA.half.median, cols = c(3,5),
  datahere = subset(x = combo.juncratio.data.parsed.ordered, subset = intronlen >= 60 & intronlen < 80))
sampling.data.combo.80.200nt <- length_sampling(
  metadatahere = combo.metadata.polyA.half.median, cols = c(3,5),
  datahere = subset(x = combo.juncratio.data.parsed.ordered, subset = intronlen >= 80 & intronlen < 200))
sampling.data.combo.200.500nt <- length_sampling(
  metadatahere = combo.metadata.polyA.half.median, cols = c(3,5),
  datahere = subset(x = combo.juncratio.data.parsed.ordered, subset = intronlen >= 200 & intronlen < 500))
sampling.data.combo.500.1000nt <- length_sampling(
  metadatahere = combo.metadata.polyA.half.median, cols = c(3,5),
  datahere = subset(x = combo.juncratio.data.parsed.ordered, subset = intronlen >= 500 & intronlen < 1000))
sampling.data.combo.1000.10000nt <- length_sampling(
  metadatahere = combo.metadata.polyA.half.median, cols = c(3,5),
  datahere = subset(x = combo.juncratio.data.parsed.ordered, subset = intronlen >= 1000 & intronlen < 10000))
sampling.data.combo.g.10000nt <- length_sampling(
  metadatahere = combo.metadata.polyA.half.median, cols = c(3,5),
  datahere = subset(x = combo.juncratio.data.parsed.ordered, subset = intronlen >= 10000))


# Summary tables (median, mean, SEM) of per-gene standard deviations, used by the bar plots.
#
# The SEM divides by the number of non-missing values, i.e. the same n that
# sd(..., na.rm=TRUE) uses. Dividing by nrow() would also count the NA rows
# (e.g. genes with a single intron have no standard deviation).
stddev_median <- function(x) median(as.numeric(x), na.rm=TRUE)
stddev_mean <- function(x) mean(as.numeric(x), na.rm=TRUE)
stddev_sem <- function(x){
  x <- as.numeric(x)
  sd(x, na.rm=TRUE)/sqrt(sum(!is.na(x)))
}
# apply one summary function to every vector of a list; returns a plain numeric vector
stddev_over <- function(sets, fun) unname(vapply(sets, fun, numeric(1)))

# sampled "genes" (all introns) of the first sampling set
sampled.all.set1 <- subset(sampling.data.combo.half.median, type=="all" & set==1)

# half-life SDs, in the order: combo metadata, sumsqfit metadata, sampled introns
halflife.sd.sets <- list(combo.metadata.polyA.half.median$Std.Dev.all.introns,
                         sumsqfit.metadata.polyA.half.median$Std.Dev.all.introns,
                         sampled.all.set1$ActualDist.stddev)
# intron length SDs, same order
intronlen.sd.sets <- list(combo.metadata.polyA.half.median$SD.length.all,
                          sumsqfit.metadata.polyA.half.median$SD.length.all,
                          sampled.all.set1$IntronLength.std)
# half-life SDs of the random samples drawn within each length class
lengthbin.sd.sets <- list(sampling.data.combo.60.80nt$Random.stddev,
                          sampling.data.combo.80.200nt$Random.stddev,
                          sampling.data.combo.200.500nt$Random.stddev,
                          sampling.data.combo.500.1000nt$Random.stddev,
                          sampling.data.combo.1000.10000nt$Random.stddev,
                          sampling.data.combo.g.10000nt$Random.stddev)

# two groups: introns from the same gene vs. sampled introns
combo.stddev.data.median <- data.frame(type=c("introns from same gene","randomly sample introns"),
                                       median_stdev = stddev_over(halflife.sd.sets[c(1,3)], stddev_median),
                                       mean_stdev = stddev_over(halflife.sd.sets[c(1,3)], stddev_mean),
                                       sem = stddev_over(halflife.sd.sets[c(1,3)], stddev_sem),
                                       median_intron_stddev = stddev_over(intronlen.sd.sets[c(1,3)], stddev_median),
                                       mean_intron_stddev = stddev_over(intronlen.sd.sets[c(1,3)], stddev_mean),
                                       sem_intron_stddev = stddev_over(intronlen.sd.sets[c(1,3)], stddev_sem))

# three groups: adds the sumsqfit metadata
combo.stddev.data.median.txnrate <- data.frame(type=c("introns from same gene","introns from random txn rates","randomly sample introns"),
                                               median_stdev = stddev_over(halflife.sd.sets, stddev_median),
                                               mean_stdev = stddev_over(halflife.sd.sets, stddev_mean),
                                               sem = stddev_over(halflife.sd.sets, stddev_sem),
                                               median_intron_stddev = stddev_over(intronlen.sd.sets, stddev_median),
                                               mean_intron_stddev = stddev_over(intronlen.sd.sets, stddev_mean),
                                               sem_intron_stddev = stddev_over(intronlen.sd.sets, stddev_sem))
# fix the plotting order; this has to run after the data frame exists
# (it used to run before the data frame was created)
combo.stddev.data.median.txnrate$type <- factor(combo.stddev.data.median.txnrate$type,
                                                levels=c("introns from same gene","introns from random txn rates","randomly sample introns"))

# nine groups: the two metadata sets, the six length classes, then the sampled introns
lengthbin.all.sets <- c(halflife.sd.sets[1:2], lengthbin.sd.sets, halflife.sd.sets[3])
combo.stddev.data.median.txnrate.length <- data.frame(type=c("introns from same gene","introns from random txn rates","random 60-80","random 80-200","random 200-500","random 500-1000","random 1000-10,000","random greater than 10,000","random"),
                                                      median_stdev = stddev_over(lengthbin.all.sets, stddev_median),
                                                      mean_stdev = stddev_over(lengthbin.all.sets, stddev_mean),
                                                      # each class uses its own n (every class used to use the 60-80 nt row count)
                                                      sem = stddev_over(lengthbin.all.sets, stddev_sem))
combo.stddev.data.median.txnrate.length$type <- factor(combo.stddev.data.median.txnrate.length$type, levels=c("introns from same gene","introns from random txn rates","random 60-80","random 80-200","random 200-500","random 500-1000","random 1000-10,000","random greater than 10,000","random"))


# long table of the individual SDs (intron length and RIME) for the distribution plots;
# assumes the sampled set has one row per row of combo.metadata.polyA.half.median
combo.stddev.data.both <- data.frame(type = rep(rep(c("introns from same gene","randomly sampled introns"), each=nrow(combo.metadata.polyA.half.median)), 2),
                                     comp = rep(c("intron length","RIME"), each=nrow(combo.metadata.polyA.half.median)*2),
                                     sd = c(combo.metadata.polyA.half.median$SD.length.all, sampled.all.set1$IntronLength.std,
                                            combo.metadata.polyA.half.median$RIME.sd, sampled.all.set1$RIME.std))

# mean half-life SD (+/- SEM) for two groups, with a significance bracket between them
ggplot(data=combo.stddev.data.median, mapping=aes(x=factor(type),y=mean_stdev,fill=factor(type))) +
  geom_bar(stat="identity",width=0.5) +
  geom_errorbar(mapping=aes(ymin=mean_stdev-sem, ymax=mean_stdev+sem),width=0.25) +
  ylim(0,6.5) +
  annotate("segment",x=1,xend=2,y=6.2,yend=6.2,size=0.25) +
  annotate("text",x=1.5,y=6.3,label="***") +
  scale_fill_manual(values=c(wes_palette("FantasticFox")[3], wes_palette("Royal1")[1]),guide=F) +
  scale_x_discrete(labels=c("introns from\nsame gene","randomly sampled\nintrons")) +
  background_grid(major="y",minor="y") +
  labs(y="half-life SD (min)",x="",fill="") +
  theme(axis.text.x=element_text(size=7))

# same plot for three groups, with two significance brackets
ggplot(data=combo.stddev.data.median.txnrate, mapping=aes(x=factor(type),y=mean_stdev,fill=factor(type))) +
  geom_bar(stat="identity",width=0.5) +
  geom_errorbar(mapping=aes(ymin=mean_stdev-sem, ymax=mean_stdev+sem),width=0.25) +
  ylim(0,6.5) +
  annotate("segment",x=c(1,2),xend=c(3,3),y=c(6.2,5.2),yend=c(6.2, 5.2),size=0.25) +
  annotate("text",x=c(2.5,2.5),y=c(6.3,5.2),label="***") +
  scale_fill_manual(values=c(wes_palette("FantasticFox")[3], wes_palette("FantasticFox")[4], wes_palette("Royal1")[1]),guide=F) +
  scale_x_discrete(labels=c("introns from\nsame gene","introns from same gene \nw/ diff txn rates","randomly sampled\nintrons")) +
  background_grid(major="y",minor="y") +
  labs(y="half-life SD (min)",x="",fill="") +
  theme(axis.text.x=element_text(size=7))

# Mean half-life SD (+/- SEM) for every group in combo.stddev.data.median.txnrate.length.
# The three-colour manual fill scale, the three x-axis labels and the significance
# brackets of the three-group plot do not fit this table (more groups than values/labels),
# which makes ggplot stop with an error, so the on-screen plot now uses the same
# layers as the PDF version.
lengthbinning.plot <- ggplot(combo.stddev.data.median.txnrate.length, aes(x=factor(type),y=mean_stdev,fill=factor(type))) +
  geom_bar(stat="identity",width=0.5) +
  geom_errorbar(aes(ymin=mean_stdev-sem, ymax=mean_stdev+sem),width=0.25) +
  labs(x="bins",y="standard deviation") +
  theme(axis.text.x=element_text(angle=45,hjust=1))
print(lengthbinning.plot)

# explicit print(): a ggplot object is only drawn to the open device when printed,
# which does not happen automatically when the code is run via source()
pdf("~/Desktop/lengthbinning_SDs.pdf",width=11,height=5)
print(lengthbinning.plot)
dev.off()

t.test(combo.metadata.polyA.half.median$Std.Dev.all.introns, subset(sampling.data.combo.half.median, type=="all" & set==1)$ActualDist.stddev)
t.test(combo.metadata.polyA.half.median$Std.Dev.remaining.introns, subset(sampling.data.combo.half.median, type=="remaining" & set==1)$ActualDist.stddev)

```

Do genes tend to have similar intron lengths?
```{r}

# Bar plot of the mean within-group SD of intron length (+/- its SEM) for
# introns from the same gene vs. randomly sampled introns.
ggplot(
  combo.stddev.data.median,
  aes(x = factor(type), y = mean_intron_stddev, fill = factor(type))
) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_errorbar(
    aes(
      ymin = mean_intron_stddev - sem_intron_stddev,
      ymax = mean_intron_stddev + sem_intron_stddev
    ),
    width = 0.25
  ) +
  # significance bracket and stars drawn by hand
  annotate("segment", x = 1, xend = 2, y = 900, yend = 900, size = 0.25) +
  ylim(0, 950) +
  annotate("text", x = 1.5, y = 910, label = "***") +
  scale_fill_manual(
    values = c(wes_palette("FantasticFox")[3], wes_palette("Royal1")[1]),
    guide = FALSE
  ) +
  scale_x_discrete(
    labels = c("introns from\nsame gene", "randomly sampled\nintrons")
  ) +
  background_grid(major = "y", minor = "y") +
  # Intron length is measured in nucleotides; the "(min)" unit previously shown
  # here was carried over from the half-life SD plots.
  labs(y = "intron length SD (nt)", x = "", fill = "") +
  theme(axis.text.x = element_text(size = 7))

# Per-gene SD distributions by comparison (comp) and group (type), log10 y-axis.
ggplot(combo.stddev.data.both, aes(x = factor(comp), y = sd, fill = factor(type))) +
  geom_boxplot(notch = TRUE) +
  scale_y_log10()
ggplot(combo.stddev.data.both, aes(x = factor(comp), y = sd, fill = factor(type))) +
  geom_violin() +
  scale_y_log10()


```

Looking at gene-level RInE
```{r}

# Histogram of percentage_exondef across genes (log10 counts).
ggplot(combo.metadata.polyA.half.median, aes(x = percentage_exondef)) +
  geom_histogram() +
  scale_y_log10()

# Split each gene's detected introns into an exon-defined count
# (detected * percentage_exondef) and the remainder.
combo.metadata.polyA.half.median$numberexons <- with(
  combo.metadata.polyA.half.median,
  Number.of.detected.introns * percentage_exondef
)
combo.metadata.polyA.half.median$numberintrons <- with(
  combo.metadata.polyA.half.median,
  Number.of.detected.introns - numberexons
)

# Classify genes: "mixed" by default, "intron" when percentage_exondef is 0,
# "exon" when percentage_exondef >= 0.66 and fewer than 2 remaining introns.
# which() leaves genes with NA values at the "mixed" default.
combo.metadata.polyA.half.median$definition <- "mixed"
combo.metadata.polyA.half.median$definition[
  with(combo.metadata.polyA.half.median, which(percentage_exondef == 0))
] <- "intron"
combo.metadata.polyA.half.median$definition[
  with(
    combo.metadata.polyA.half.median,
    which(percentage_exondef >= 0.66 & numberintrons < 2)
  )
] <- "exon"
combo.metadata.polyA.half.median$definition <- factor(
  combo.metadata.polyA.half.median$definition,
  levels = c("intron", "mixed", "exon")
)

# Attach expression: the `total` column of adelman.txi, looked up by gene ID
# against its row names.
combo.metadata.polyA.half.median$TPM <- adelman.txi$total[
  match(combo.metadata.polyA.half.median$Gene.ID, row.names(adelman.txi))
]

# Histogram of percentage_optimal, then a binary label: "optimal" only when
# percentage_optimal is exactly 1.
ggplot(combo.metadata.polyA.half.median, aes(x = percentage_optimal)) +
  geom_histogram() +
  scale_y_log10()
combo.metadata.polyA.half.median$optimal <- "non-optimal"
combo.metadata.polyA.half.median$optimal[
  with(combo.metadata.polyA.half.median, which(percentage_optimal == 1))
] <- "optimal"

# get sampling for each subset
# corr_halfs_all_median() is defined outside this section. Each call receives
# the gene-level rows of one definition class, the vector c(3,5) (meaning not
# visible here), and an intron-level table: IEratio<1 rows for intron-defined
# genes, IEratio>1 rows for exon-defined genes, all rows for mixed genes.
# NOTE(review): the helper presumably draws random samples, so the order of
# these three calls would affect the RNG stream -- confirm before reordering.
sampling.data.ID <- corr_halfs_all_median(subset(combo.metadata.polyA.half.median, definition=="intron"), c(3,5), subset(combo.juncratio.data.parsed.ordered, IEratio<1))
# `both` combines type and set into one factor, ordered all.1..all.10 then
# remaining.1..remaining.10.
sampling.data.ID$both <- interaction(sampling.data.ID$type, sampling.data.ID$set)
sampling.data.ID$both <- factor(sampling.data.ID$both, levels=c(paste("all",seq(1,10),sep="."),paste("remaining",seq(1,10),sep=".")))

sampling.data.ED <- corr_halfs_all_median(subset(combo.metadata.polyA.half.median, definition=="exon"), c(3,5), subset(combo.juncratio.data.parsed.ordered, IEratio>1))
sampling.data.ED$both <- interaction(sampling.data.ED$type, sampling.data.ED$set)
sampling.data.ED$both <- factor(sampling.data.ED$both, levels=c(paste("all",seq(1,10),sep="."),paste("remaining",seq(1,10),sep=".")))

sampling.data.mixed <- corr_halfs_all_median(subset(combo.metadata.polyA.half.median, definition=="mixed"), c(3,5), combo.juncratio.data.parsed.ordered)
sampling.data.mixed$both <- interaction(sampling.data.mixed$type, sampling.data.mixed$set)
sampling.data.mixed$both <- factor(sampling.data.mixed$both, levels=c(paste("all",seq(1,10),sep="."),paste("remaining",seq(1,10),sep=".")))

# Long table with, per definition class, the observed per-gene values
# ("same gene") followed by the values from sampling set 1 of type "all"
# ("randomly sampled").
# NOTE(review): the rep() lengths assume each sampling table has exactly
# 20 rows per gene (2 types x 10 sets), so nrow/20 equals the number of genes
# in the class -- verify against corr_halfs_all_median.
combo.stddev.data.def <- data.frame(def = c(rep("intron",nrow(sampling.data.ID)/10), rep("mixed",nrow(sampling.data.mixed)/10), rep("exon",nrow(sampling.data.ED)/10)),
                                    type = c(rep(c("same gene","randomly sampled"),each=nrow(sampling.data.ID)/20),
                                             rep(c("same gene","randomly sampled"),each=nrow(sampling.data.mixed)/20),
                                             rep(c("same gene","randomly sampled"),each=nrow(sampling.data.ED)/20)),
                                    rate = c(subset(combo.metadata.polyA.half.median, definition=="intron")$Splicing.rate.all.introns, subset(sampling.data.ID, type=="all" & set==1)$ActualDist.splicing.rate,
                                             subset(combo.metadata.polyA.half.median, definition=="mixed")$Splicing.rate.all.introns, subset(sampling.data.mixed, type=="all" & set==1)$ActualDist.splicing.rate,
                                             subset(combo.metadata.polyA.half.median, definition=="exon")$Splicing.rate.all.introns, subset(sampling.data.ED, type=="all" & set==1)$ActualDist.splicing.rate),
                                    stddev = c(subset(combo.metadata.polyA.half.median, definition=="intron")$Std.Dev.all.introns, subset(sampling.data.ID, type=="all" & set==1)$ActualDist.stddev,
                                             subset(combo.metadata.polyA.half.median, definition=="mixed")$Std.Dev.all.introns, subset(sampling.data.mixed, type=="all" & set==1)$ActualDist.stddev,
                                             subset(combo.metadata.polyA.half.median, definition=="exon")$Std.Dev.all.introns, subset(sampling.data.ED, type=="all" & set==1)$ActualDist.stddev))
# Fix the display order of both grouping factors.
combo.stddev.data.def$def <- factor(combo.stddev.data.def$def, levels=c("intron","mixed","exon"))
combo.stddev.data.def$type <- factor(combo.stddev.data.def$type, levels=c("same gene","randomly sampled"))

# SD by definition class and group: notched boxplots (y limited to 0-5) and
# empirical CDFs (x limited to 0-3).
ggplot(combo.stddev.data.def, aes(x=factor(def),y=stddev,fill=factor(type))) + geom_boxplot(notch=T) + ylim(0,5)
ggplot(combo.stddev.data.def, aes(x=stddev,color=factor(def),linetype=factor(type))) + stat_ecdf() + xlim(0,3)

# Write three notched boxplots by definition class to genewise_RINE.pdf,
# restricted to genes with more than one detected intron.
pdf("genewise_RINE.pdf")
# with STD DEV
ggplot(
  subset(combo.metadata.polyA.half.median, Number.of.detected.introns > 1),
  aes(x = factor(definition), y = Std.Dev.all.introns)
) +
  geom_boxplot(notch = TRUE) +
  ylim(0, 3.5) +
  labs(x = "", y = "standard deviation")
# with mean Splicing rate
ggplot(
  subset(combo.metadata.polyA.half.median, Number.of.detected.introns > 1),
  aes(x = factor(definition), y = Splicing.rate.all.introns)
) +
  geom_boxplot(notch = TRUE) +
  ylim(0, 5) +
  labs(x = "", y = "half-life (min)")
# with mean expression
ggplot(
  subset(combo.metadata.polyA.half.median, Number.of.detected.introns > 1),
  aes(x = factor(definition), y = TPM)
) +
  geom_boxplot(notch = TRUE) +
  scale_y_log10() +
  labs(x = "", y = "TPM")
dev.off()

### match for TPM

# Build an expression-matched comparison of exon-, intron- and mixed-definition
# genes.
#
# Rows with a non-missing Std.Dev.all.introns are split by `definition`. TPM is
# cut into bins of width `window` spanning the TPM range of the exon-definition
# rows. For every bin that holds at least one row of each class, all exon rows
# in the bin are kept and the same number of intron and mixed rows are drawn
# from that bin with replacement.
#
# Args:
#   datahere: data frame with columns definition, TPM,
#             Splicing.rate.all.introns and Std.Dev.all.introns.
#   window:   width of the TPM bins.
#
# Returns:
#   A data frame with columns type ("exon", "intron", "mixed"), rate, stddev
#   and TPM; it has zero rows when no bin contains all three classes.
#
# Side effects: prints the number of bin breakpoints and advances the RNG.
TPMmatch <- function(datahere, window){
  exon.data <- subset(datahere, definition=="exon" & !is.na(Std.Dev.all.introns))
  intron.data <- subset(datahere, definition=="intron" & !is.na(Std.Dev.all.introns))
  mixed.data <- subset(datahere, definition=="mixed" & !is.na(Std.Dev.all.introns))

  # bin by window size; missing TPM values are ignored when locating the range
  # so they cannot make seq() fail (such rows never fall into a bin anyway).
  exon.tpm <- exon.data$TPM[is.finite(exon.data$TPM)]
  bin.vec <- if (length(exon.tpm) > 0) {
    seq(floor(min(exon.tpm)), ceiling(max(exon.tpm)), by = window)
  } else {
    numeric(0)
  }
  print(length(bin.vec))

  # Draw n indices from `pool` with replacement. Indexing through sample.int()
  # instead of sample(pool, ...) matters when `pool` holds a single index k:
  # sample(k, ...) would draw from 1:k rather than return k.
  draw <- function(pool, n) {
    pool[sample.int(length(pool), n, replace = TRUE)]
  }

  # Bins are half-open intervals (lower, upper].
  # NOTE(review): a row whose TPM equals the first breakpoint, or exceeds the
  # last one (the range is not always a multiple of `window`), is never matched.
  n.bins <- max(length(bin.vec) - 1, 0)
  exon.inds <- intron.inds <- mixed.inds <- vector("list", n.bins)
  for (b in seq_len(n.bins)) {
    lower <- bin.vec[b]
    upper <- bin.vec[b + 1]
    exon.hold <- which(exon.data$TPM > lower & exon.data$TPM <= upper)
    intron.hold <- which(intron.data$TPM > lower & intron.data$TPM <= upper)
    mixed.hold <- which(mixed.data$TPM > lower & mixed.data$TPM <= upper)
    if (length(exon.hold) > 0 && length(intron.hold) > 0 && length(mixed.hold) > 0) {
      exon.inds[[b]] <- exon.hold
      intron.inds[[b]] <- draw(intron.hold, length(exon.hold))
      mixed.inds[[b]] <- draw(mixed.hold, length(exon.hold))
    }
  }
  exon.inds <- unlist(exon.inds)
  intron.inds <- unlist(intron.inds)
  mixed.inds <- unlist(mixed.inds)

  matched.data <- data.frame(type=c(rep("exon",length(exon.inds)),
                                    rep("intron",length(intron.inds)),
                                    rep("mixed",length(mixed.inds))),
                             rate = c(exon.data$Splicing.rate.all.introns[exon.inds], intron.data$Splicing.rate.all.introns[intron.inds], mixed.data$Splicing.rate.all.introns[mixed.inds]),
                             stddev = c(exon.data$Std.Dev.all.introns[exon.inds], intron.data$Std.Dev.all.introns[intron.inds], mixed.data$Std.Dev.all.introns[mixed.inds]),
                             TPM = c(exon.data$TPM[exon.inds], intron.data$TPM[intron.inds], mixed.data$TPM[mixed.inds]))
  return(matched.data)
}

# Expression-matched gene sets using TPM bins of width 5.
tpmmatched.2 <- TPMmatch(datahere = combo.metadata.polyA.half.median, window = 5)
tpmmatched.2$type <- factor(
  tpmmatched.2$type,
  levels = c("intron", "mixed", "exon")
)

# Notched boxplots per class: TPM and rate on log10 axes, SD limited to 0-3.
ggplot(tpmmatched.2, aes(x = factor(type), y = TPM)) +
  geom_boxplot(notch = TRUE) +
  scale_y_log10()
ggplot(tpmmatched.2, aes(x = factor(type), y = rate)) +
  geom_boxplot(notch = TRUE) +
  scale_y_log10()
ggplot(tpmmatched.2, aes(x = factor(type), y = stddev)) +
  geom_boxplot(notch = TRUE) +
  ylim(0, 3)

```

GO categories
```{r}

# write out genes
# Unique gene IDs overall and per definition class, each written as a plain
# one-column text file (no header, no quoting) under definition/GO/.
all.genes <- as.character(unique(combo.metadata.polyA.half.median$Gene.ID))
write.table(
  all.genes,
  file = "definition/GO/all_genes.txt",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

ID.genes <- as.character(unique(
  combo.metadata.polyA.half.median$Gene.ID[
    which(combo.metadata.polyA.half.median$definition == "intron")
  ]
))
write.table(
  ID.genes,
  file = "definition/GO/ID_genes.txt",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

mixed.genes <- as.character(unique(
  combo.metadata.polyA.half.median$Gene.ID[
    which(combo.metadata.polyA.half.median$definition == "mixed")
  ]
))
write.table(
  mixed.genes,
  file = "definition/GO/mixed_genes.txt",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)

ED.genes <- as.character(unique(
  combo.metadata.polyA.half.median$Gene.ID[
    which(combo.metadata.polyA.half.median$definition == "exon")
  ]
))
write.table(
  ED.genes,
  file = "definition/GO/ED_genes.txt",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)


### use clusterProfiler
#source("https://bioconductor.org/biocLite.R")
#biocLite("clusterProfiler")
#biocLite("org.Dm.eg.db")

library(clusterProfiler)
library(org.Dm.eg.db)

# get genes in quantiles
# Quintile breakpoints of percentage_exondef over rows with a value above 0.
# quantile() needs the numeric column, not the whole data frame; quant.vec is
# only used by the commented-out quantile gene sets below.
quant.vec <- quantile(subset(combo.metadata.polyA.half.median, percentage_exondef>0)$percentage_exondef, seq(0.2,1,0.2))
genes.introndef <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, definition=="intron")$Gene.ID)
# Random subset with as many rows as there are exon-definition genes. Rows are
# drawn from the full table (nrow), matching how the mixed subsets are drawn.
genes.introndef.sub <- genes.introndef[sample(nrow(genes.introndef), length(ED.genes)),]

#genes.mixeddef <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, definition=="mixed")$Gene.ID)
# Mixed genes split at the median percentage_exondef of the mixed class
# (below the median = "low", at or above = "high"), each also subsampled to
# the number of exon-definition genes.
genes.mixeddef.low <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, definition=="mixed" & percentage_exondef<quantile(subset(combo.metadata.polyA.half.median, definition=="mixed")$percentage_exondef, 0.5))$Gene.ID)
genes.mixedlowdef.sub <- genes.mixeddef.low[sample(nrow(genes.mixeddef.low), length(ED.genes)),]
genes.mixeddef.high <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, definition=="mixed" & percentage_exondef>=quantile(subset(combo.metadata.polyA.half.median, definition=="mixed")$percentage_exondef, 0.5))$Gene.ID)
genes.mixedhighdef.sub <- genes.mixeddef.high[sample(nrow(genes.mixeddef.high), length(ED.genes)),]

genes.exondef <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, definition=="exon")$Gene.ID)
#genes.quant1 <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, percentage_exondef > 0 & percentage_exondef < quant.vec[1])$Gene.ID)
#genes.quant2 <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, percentage_exondef > quant.vec[1] & percentage_exondef < quant.vec[2])$Gene.ID)
#genes.quant3 <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, percentage_exondef > quant.vec[2] & percentage_exondef < quant.vec[3])$Gene.ID)
#genes.quant4 <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, percentage_exondef > quant.vec[3] & percentage_exondef < quant.vec[4])$Gene.ID)
#genes.quant5 <- data.frame(gene_name=subset(combo.metadata.polyA.half.median, percentage_exondef > quant.vec[4] & percentage_exondef < quant.vec[5])$Gene.ID)

# convert genes to ENTREZIDs
# Every call maps FLYBASE IDs to ENTREZID through org.Dm.eg.db.
genes.all <- bitr(
  data.frame(gene_name = all.genes)$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
genes.introndef.tr <- bitr(
  genes.introndef$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
# genes.mixeddef.tr <- bitr(genes.mixeddef$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")
genes.exondef.tr <- bitr(
  genes.exondef$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)

# size-matched subsets (already plain vectors of gene IDs)
genes.introndef.sub.tr <- bitr(
  genes.introndef.sub,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
genes.mixedlowdef.sub.tr <- bitr(
  genes.mixedlowdef.sub,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
genes.mixedhighdef.sub.tr <- bitr(
  genes.mixedhighdef.sub,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
# full low/high mixed sets
genes.mixedlowdef.tr <- bitr(
  genes.mixeddef.low$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
genes.mixedhighdef.tr <- bitr(
  genes.mixeddef.high$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)

#genes.quant1.tr <- bitr(genes.quant1$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")
#genes.quant2.tr <- bitr(genes.quant2$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")
#genes.quant3.tr <- bitr(genes.quant3$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")
#genes.quant4.tr <- bitr(genes.quant4$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")
#genes.quant5.tr <- bitr(genes.quant5$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")

# get enrichments
# All calls use ont = "BP", BH adjustment and p/q cutoffs of 1, so every tested
# term is returned. The universe is testallgenes_tr, which is not defined in
# this section.
genes.introndef.enrich <- summary(enrichGO(
  gene = genes.introndef.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))
#genes.mixeddef.enrich <- summary(enrichGO(gene=genes.mixeddef.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))

# size-matched subsets
genes.introndef.sub.enrich <- summary(enrichGO(
  gene = genes.introndef.sub.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))
genes.mixedlowdef.sub.enrich <- summary(enrichGO(
  gene = genes.mixedlowdef.sub.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))
genes.mixedhighdef.sub.enrich <- summary(enrichGO(
  gene = genes.mixedhighdef.sub.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))

# full low/high mixed sets
genes.mixedlowdef.enrich <- summary(enrichGO(
  gene = genes.mixedlowdef.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))
genes.mixedhighdef.enrich <- summary(enrichGO(
  gene = genes.mixedhighdef.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))

genes.exondef.enrich <- summary(enrichGO(
  gene = genes.exondef.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))
#genes.quant1.enrich <- summary(enrichGO(gene=genes.quant1.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))
#genes.quant2.enrich <- summary(enrichGO(gene=genes.quant2.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))
#genes.quant3.enrich <- summary(enrichGO(gene=genes.quant3.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))
#genes.quant4.enrich <- summary(enrichGO(gene=genes.quant4.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))
#genes.quant5.enrich <- summary(enrichGO(gene=genes.quant5.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))

# calculate enrichment values
# Convert "k/n" ratio strings into odds k / (n - k).
#
# Args:
#   fractioncol: character vector or factor of fractions written as "k/n"
#                (the script passes the GeneRatio and BgRatio columns).
#
# Returns:
#   Unnamed numeric vector, one value per element of `fractioncol`. Empty input
#   gives numeric(0); NA or entries without a "/" give NA instead of shifting
#   the numerator/denominator pairing of every following entry.
getenrich <- function(fractioncol){
  parts <- strsplit(as.character(fractioncol), split="/")
  # Take numerator and denominator per element so one malformed entry cannot
  # misalign the others.
  hits <- vapply(parts, function(p) as.numeric(p[1]), numeric(1))
  total <- vapply(parts, function(p) as.numeric(p[2]), numeric(1))
  hits / (total - hits)
}

# genes.introndef.enrich$set_enrich <- getenrich(genes.introndef.enrich$GeneRatio)
# genes.introndef.enrich$bg_enrich <- getenrich(genes.introndef.enrich$BgRatio)
# genes.introndef.enrich$all_enrich <- genes.introndef.enrich$set_enrich/genes.introndef.enrich$bg_enrich
# genes.mixeddef.enrich$set_enrich <- getenrich(genes.mixeddef.enrich$GeneRatio)
# genes.mixeddef.enrich$bg_enrich <- getenrich(genes.mixeddef.enrich$BgRatio)
# genes.mixeddef.enrich$all_enrich <- genes.mixeddef.enrich$set_enrich/genes.mixeddef.enrich$bg_enrich

# For each enrichment table: set_enrich from GeneRatio, bg_enrich from BgRatio
# (both via getenrich), and all_enrich as their ratio.

# size-matched intron-definition subset
genes.introndef.sub.enrich[["set_enrich"]] <- getenrich(genes.introndef.sub.enrich[["GeneRatio"]])
genes.introndef.sub.enrich[["bg_enrich"]] <- getenrich(genes.introndef.sub.enrich[["BgRatio"]])
genes.introndef.sub.enrich[["all_enrich"]] <- with(
  genes.introndef.sub.enrich,
  set_enrich / bg_enrich
)

# size-matched mixed (low) subset
genes.mixedlowdef.sub.enrich[["set_enrich"]] <- getenrich(genes.mixedlowdef.sub.enrich[["GeneRatio"]])
genes.mixedlowdef.sub.enrich[["bg_enrich"]] <- getenrich(genes.mixedlowdef.sub.enrich[["BgRatio"]])
genes.mixedlowdef.sub.enrich[["all_enrich"]] <- with(
  genes.mixedlowdef.sub.enrich,
  set_enrich / bg_enrich
)

# size-matched mixed (high) subset
genes.mixedhighdef.sub.enrich[["set_enrich"]] <- getenrich(genes.mixedhighdef.sub.enrich[["GeneRatio"]])
genes.mixedhighdef.sub.enrich[["bg_enrich"]] <- getenrich(genes.mixedhighdef.sub.enrich[["BgRatio"]])
genes.mixedhighdef.sub.enrich[["all_enrich"]] <- with(
  genes.mixedhighdef.sub.enrich,
  set_enrich / bg_enrich
)

# full mixed (low) set
genes.mixedlowdef.enrich[["set_enrich"]] <- getenrich(genes.mixedlowdef.enrich[["GeneRatio"]])
genes.mixedlowdef.enrich[["bg_enrich"]] <- getenrich(genes.mixedlowdef.enrich[["BgRatio"]])
genes.mixedlowdef.enrich[["all_enrich"]] <- with(
  genes.mixedlowdef.enrich,
  set_enrich / bg_enrich
)

# full mixed (high) set
genes.mixedhighdef.enrich[["set_enrich"]] <- getenrich(genes.mixedhighdef.enrich[["GeneRatio"]])
genes.mixedhighdef.enrich[["bg_enrich"]] <- getenrich(genes.mixedhighdef.enrich[["BgRatio"]])
genes.mixedhighdef.enrich[["all_enrich"]] <- with(
  genes.mixedhighdef.enrich,
  set_enrich / bg_enrich
)


# exon-definition set
genes.exondef.enrich[["set_enrich"]] <- getenrich(genes.exondef.enrich[["GeneRatio"]])
genes.exondef.enrich[["bg_enrich"]] <- getenrich(genes.exondef.enrich[["BgRatio"]])
genes.exondef.enrich[["all_enrich"]] <- with(
  genes.exondef.enrich,
  set_enrich / bg_enrich
)

#genes.quant1.enrich$set_enrich <- getenrich(genes.quant1.enrich$GeneRatio)
#genes.quant1.enrich$bg_enrich <- getenrich(genes.quant1.enrich$BgRatio)
#genes.quant1.enrich$all_enrich <- genes.quant1.enrich$set_enrich/genes.quant1.enrich$bg_enrich
#genes.quant2.enrich$set_enrich <- getenrich(genes.quant2.enrich$GeneRatio)
#genes.quant2.enrich$bg_enrich <- getenrich(genes.quant2.enrich$BgRatio)
#genes.quant2.enrich$all_enrich <- genes.quant2.enrich$set_enrich/genes.quant2.enrich$bg_enrich
#genes.quant3.enrich$set_enrich <- getenrich(genes.quant3.enrich$GeneRatio)
#genes.quant3.enrich$bg_enrich <- getenrich(genes.quant3.enrich$BgRatio)
#genes.quant3.enrich$all_enrich <- genes.quant3.enrich$set_enrich/genes.quant3.enrich$bg_enrich
#genes.quant4.enrich$set_enrich <- getenrich(genes.quant4.enrich$GeneRatio)
#genes.quant4.enrich$bg_enrich <- getenrich(genes.quant4.enrich$BgRatio)
#genes.quant4.enrich$all_enrich <- genes.quant4.enrich$set_enrich/genes.quant4.enrich$bg_enrich
#genes.quant5.enrich$set_enrich <- getenrich(genes.quant5.enrich$GeneRatio)
#genes.quant5.enrich$bg_enrich <- getenrich(genes.quant5.enrich$BgRatio)
#genes.quant5.enrich$all_enrich <- genes.quant5.enrich$set_enrich/genes.quant5.enrich$bg_enrich

# to do level enrichments - without background subset
# GO BP terms at level 2 and level 3 for the background gene set
# (testallgenes_tr is not defined in this section).
secondenrichlevel <- groupGO(gene = testallgenes_tr$ENTREZID, organism="fly", ont="BP", level=2, readable = TRUE)
secondlevel.id <- as.character(summary(secondenrichlevel)$ID)
secondlevel.description <- as.character(summary(secondenrichlevel)$Description)

thirdenrichlevel <- groupGO(gene = testallgenes_tr$ENTREZID, organism="fly", ont="BP", level=3, readable = TRUE)
thirdlevel.id <- as.character(summary(thirdenrichlevel)$ID)
thirdlevel.description <- as.character(summary(thirdenrichlevel)$Description)

# Terms with qvalue < 0.1 in either the intron- or the exon-definition set.
# as.character() keeps the IDs as strings even if the ID columns are factors
# (combining two factors with c() can otherwise return integer codes).
significanthits <- unique(c(as.character(subset(genes.introndef.enrich, qvalue < 0.1)$ID),
                            as.character(subset(genes.exondef.enrich, qvalue < 0.1)$ID)))

# Look up each term's description in the intron-definition table and fall back
# to the exon-definition table for terms that only occur there; otherwise such
# terms carry an NA description and are dropped by later description filters.
significanthits.description <- as.character(genes.introndef.enrich$Description)[match(significanthits, genes.introndef.enrich$ID)]
significanthits.description <- ifelse(is.na(significanthits.description),
                                      as.character(genes.exondef.enrich$Description)[match(significanthits, genes.exondef.enrich$ID)],
                                      significanthits.description)


# get matches
# Row positions of each term list inside each enrichment table (NA where the
# term is absent from that table).
#match.intron <- match(secondlevel.id, genes.introndef.enrich$ID)
#match.mixed <- match(secondlevel.id, genes.mixeddef.enrich$ID)

# level-2 terms
match2.intron.sub <- match(x = secondlevel.id, table = genes.introndef.sub.enrich$ID)
match2.mixedlow.sub <- match(x = secondlevel.id, table = genes.mixedlowdef.sub.enrich$ID)
match2.mixedhigh.sub <- match(x = secondlevel.id, table = genes.mixedhighdef.sub.enrich$ID)
match2.exon <- match(x = secondlevel.id, table = genes.exondef.enrich$ID)

# level-3 terms
match3.intron.sub <- match(x = thirdlevel.id, table = genes.introndef.sub.enrich$ID)
match3.mixedlow.sub <- match(x = thirdlevel.id, table = genes.mixedlowdef.sub.enrich$ID)
match3.mixedhigh.sub <- match(x = thirdlevel.id, table = genes.mixedhighdef.sub.enrich$ID)
match3.exon <- match(x = thirdlevel.id, table = genes.exondef.enrich$ID)

# significant terms, size-matched subsets
match.sig.intron.sub <- match(x = significanthits, table = genes.introndef.sub.enrich$ID)
match.sig.mixedlow.sub <- match(x = significanthits, table = genes.mixedlowdef.sub.enrich$ID)
match.sig.mixedhigh.sub <- match(x = significanthits, table = genes.mixedhighdef.sub.enrich$ID)

# significant terms, full sets
match.sig.intron <- match(x = significanthits, table = genes.introndef.enrich$ID)
match.sig.mixedlow <- match(x = significanthits, table = genes.mixedlowdef.enrich$ID)
match.sig.mixedhigh <- match(x = significanthits, table = genes.mixedhighdef.enrich$ID)

match.sig.exon <- match(x = significanthits, table = genes.exondef.enrich$ID)

#match1 <- match(secondlevel.id, genes.quant1.enrich$ID)
#match2 <- match(secondlevel.id, genes.quant2.enrich$ID)
#match3 <- match(secondlevel.id, genes.quant3.enrich$ID)
#match4 <- match(secondlevel.id, genes.quant4.enrich$ID)
#match5 <- match(secondlevel.id, genes.quant5.enrich$ID)

# secondlevel.data <- data.frame(id = rep(secondlevel.id, 6),
#                                description= rep(secondlevel.description, 6),
#                                type = rep(c("quant0","quant1","quant2","quant3","quant4","quant5"),each=length(secondlevel.id)),
#                                enrich = c(genes.introndef.enrich$all_enrich[match0], genes.quant1.enrich$all_enrich[match1],
#                                           genes.quant2.enrich$all_enrich[match2], genes.quant3.enrich$all_enrich[match3],
#                                           genes.quant4.enrich$all_enrich[match4], genes.quant5.enrich$all_enrich[match5]),
#                                p.adjust = c(genes.introndef.enrich$p.adjust[match0], genes.quant1.enrich$p.adjust[match1],
#                                           genes.quant2.enrich$p.adjust[match2], genes.quant3.enrich$p.adjust[match3],
#                                           genes.quant4.enrich$p.adjust[match4], genes.quant5.enrich$p.adjust[match5]))

# Long-format tables: one row per (term, gene set), stacked in the order
# intron, mixedlow, mixedhigh, exon; rows without an enrichment value are
# dropped at the end of each table.

# level-2 terms (size-matched intron/mixed subsets, full exon set)
secondlevel.data <- data.frame(
  id = rep(secondlevel.id, times = 4),
  description = rep(secondlevel.description, times = 4),
  type = rep(
    c("intron", "mixedlow", "mixedhigh", "exon"),
    each = length(secondlevel.id)
  ),
  enrich = c(
    genes.introndef.sub.enrich$all_enrich[match2.intron.sub],
    genes.mixedlowdef.sub.enrich$all_enrich[match2.mixedlow.sub],
    genes.mixedhighdef.sub.enrich$all_enrich[match2.mixedhigh.sub],
    genes.exondef.enrich$all_enrich[match2.exon]
  ),
  p.adjust = c(
    genes.introndef.sub.enrich$p.adjust[match2.intron.sub],
    genes.mixedlowdef.sub.enrich$p.adjust[match2.mixedlow.sub],
    genes.mixedhighdef.sub.enrich$p.adjust[match2.mixedhigh.sub],
    genes.exondef.enrich$p.adjust[match2.exon]
  )
)
secondlevel.data$type <- factor(
  secondlevel.data$type,
  levels = c("intron", "mixedlow", "mixedhigh", "exon")
)
secondlevel.data <- secondlevel.data[!is.na(secondlevel.data$enrich), ]

# level-3 terms
thirdlevel.data <- data.frame(
  id = rep(thirdlevel.id, times = 4),
  description = rep(thirdlevel.description, times = 4),
  type = rep(
    c("intron", "mixedlow", "mixedhigh", "exon"),
    each = length(thirdlevel.id)
  ),
  enrich = c(
    genes.introndef.sub.enrich$all_enrich[match3.intron.sub],
    genes.mixedlowdef.sub.enrich$all_enrich[match3.mixedlow.sub],
    genes.mixedhighdef.sub.enrich$all_enrich[match3.mixedhigh.sub],
    genes.exondef.enrich$all_enrich[match3.exon]
  ),
  p.adjust = c(
    genes.introndef.sub.enrich$p.adjust[match3.intron.sub],
    genes.mixedlowdef.sub.enrich$p.adjust[match3.mixedlow.sub],
    genes.mixedhighdef.sub.enrich$p.adjust[match3.mixedhigh.sub],
    genes.exondef.enrich$p.adjust[match3.exon]
  )
)
thirdlevel.data$type <- factor(
  thirdlevel.data$type,
  levels = c("intron", "mixedlow", "mixedhigh", "exon")
)
thirdlevel.data <- thirdlevel.data[!is.na(thirdlevel.data$enrich), ]

# significant terms, size-matched subsets
siglevel.data <- data.frame(
  id = rep(significanthits, times = 4),
  description = rep(significanthits.description, times = 4),
  type = rep(
    c("intron", "mixedlow", "mixedhigh", "exon"),
    each = length(significanthits)
  ),
  enrich = c(
    genes.introndef.sub.enrich$all_enrich[match.sig.intron.sub],
    genes.mixedlowdef.sub.enrich$all_enrich[match.sig.mixedlow.sub],
    genes.mixedhighdef.sub.enrich$all_enrich[match.sig.mixedhigh.sub],
    genes.exondef.enrich$all_enrich[match.sig.exon]
  ),
  p.adjust = c(
    genes.introndef.sub.enrich$p.adjust[match.sig.intron.sub],
    genes.mixedlowdef.sub.enrich$p.adjust[match.sig.mixedlow.sub],
    genes.mixedhighdef.sub.enrich$p.adjust[match.sig.mixedhigh.sub],
    genes.exondef.enrich$p.adjust[match.sig.exon]
  )
)
siglevel.data$type <- factor(
  siglevel.data$type,
  levels = c("intron", "mixedlow", "mixedhigh", "exon")
)
siglevel.data <- siglevel.data[!is.na(siglevel.data$enrich), ]

# significant terms, full (not subsampled) gene sets
siglevel.data.all <- data.frame(
  id = rep(significanthits, times = 4),
  description = rep(significanthits.description, times = 4),
  type = rep(
    c("intron", "mixedlow", "mixedhigh", "exon"),
    each = length(significanthits)
  ),
  enrich = c(
    genes.introndef.enrich$all_enrich[match.sig.intron],
    genes.mixedlowdef.enrich$all_enrich[match.sig.mixedlow],
    genes.mixedhighdef.enrich$all_enrich[match.sig.mixedhigh],
    genes.exondef.enrich$all_enrich[match.sig.exon]
  ),
  p.adjust = c(
    genes.introndef.enrich$p.adjust[match.sig.intron],
    genes.mixedlowdef.enrich$p.adjust[match.sig.mixedlow],
    genes.mixedhighdef.enrich$p.adjust[match.sig.mixedhigh],
    genes.exondef.enrich$p.adjust[match.sig.exon]
  )
)
siglevel.data.all$type <- factor(
  siglevel.data.all$type,
  levels = c("intron", "mixedlow", "mixedhigh", "exon")
)
siglevel.data.all <- siglevel.data.all[!is.na(siglevel.data.all$enrich), ]


# Keep only terms that have a value in all four gene sets and whose largest
# log2 enrichment exceeds 0.5 (size-matched subsets).
siglevel.categories <- unique(siglevel.data$id)
siglevel.data.parsed <- c()
# seq_along() makes the loop a no-op when there are no categories; `&&` skips
# the max() call (and its warning) when a term has no rows.
for(i in seq_along(siglevel.categories)){
  print(i)
  hold <- subset(siglevel.data, id==siglevel.categories[i])
  if(nrow(hold) == 4 && max(log2(hold$enrich)) >0.5){
    siglevel.data.parsed <- rbind(siglevel.data.parsed, hold)
  }
}

# Same filter applied to the full gene sets.
# NOTE(review): the categories are still taken from siglevel.data (the
# subsampled table), not siglevel.data.all -- confirm this is intended.
siglevel.categories <- unique(siglevel.data$id)
siglevel.data.parsed.all <- c()
for(i in seq_along(siglevel.categories)){
  print(i)
  hold <- subset(siglevel.data.all, id==siglevel.categories[i])
  if(nrow(hold) == 4 && max(log2(hold$enrich)) >0.5){
    siglevel.data.parsed.all <- rbind(siglevel.data.parsed.all, hold)
  }
}


# Enrichment per gene set, one facet per GO term; bar opacity follows
# -log10(adjusted p-value).
ggplot(
  secondlevel.data,
  aes(x = factor(type), y = enrich, fill = factor(description), alpha = -log10(p.adjust))
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~description)
ggplot(
  thirdlevel.data,
  aes(x = factor(type), y = enrich, fill = factor(type), alpha = -log10(p.adjust))
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~description)

# Significant terms (root term "biological_process" excluded): unfiltered,
# filtered subsets, and filtered full sets.
ggplot(
  subset(siglevel.data, description != "biological_process"),
  aes(x = factor(type), y = enrich, fill = factor(type), alpha = -log10(p.adjust))
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~description)
ggplot(
  subset(siglevel.data.parsed, description != "biological_process"),
  aes(x = factor(type), y = enrich, fill = factor(type), alpha = -log10(p.adjust))
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~description)
ggplot(
  subset(siglevel.data.parsed.all, description != "biological_process"),
  aes(x = factor(type), y = enrich, fill = factor(type), alpha = -log10(p.adjust))
) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~description)

# Hand-picked GO term descriptions to display (id.* and ed.* lists).
id.cats <- c("mRNA metabolic process","mRNA processing","mRNA splicing, via spliceosome","glycoprotein metabolic process","ribonucleoprotein complex biogenesis","ribosome biogenesis")
ed.cats <- c("embryo development","response to oxidative stress","spindle elongation","morphogenesis of an epithelium","tissue morphogenesis")
all.cats <- c(id.cats, ed.cats)

# Rows of the filtered subset table for the ed.cats terms, in ed.cats order.
siglevel.chosen <- c()
for(i in seq_along(ed.cats)){
  print(i)
  hold <- subset(siglevel.data.parsed, description==ed.cats[i])
  siglevel.chosen <- rbind(siglevel.chosen, hold)
}
# Reverse the gene-set order (it is flipped back visually by coord_flip()).
siglevel.chosen$type <- factor(siglevel.chosen$type, levels=rev(c("intron","mixedlow","mixedhigh","exon")))

# Rows of the filtered full-set table for every chosen term.
siglevel.chosen.all <- c()
for(i in seq_along(all.cats)){
  print(i)
  hold <- subset(siglevel.data.parsed.all, description==all.cats[i])
  siglevel.chosen.all <- rbind(siglevel.chosen.all, hold)
}
# Re-level the table that was just built; the earlier code re-levelled
# siglevel.chosen a second time here and left siglevel.chosen.all unordered.
siglevel.chosen.all$type <- factor(siglevel.chosen.all$type, levels=rev(c("intron","mixedlow","mixedhigh","exon")))

ggplot(siglevel.chosen, aes(x=factor(description),y=enrich,fill=factor(type))) + geom_bar(stat="identity",position="dodge") + coord_flip()
ggplot(siglevel.chosen.all, aes(x=factor(description),y=enrich,fill=factor(type))) + geom_bar(stat="identity",position="dodge") + coord_flip()

# Final figure: fixed term order, log2 enrichment, blue palette.
siglevel.chosen$description <- factor(siglevel.chosen$description, levels=c("spindle elongation","response to oxidative stress","embryo development","morphogenesis of an epithelium","tissue morphogenesis"))
ggplot(siglevel.chosen, aes(x=factor(description),y=log2(enrich),fill=factor(type))) + geom_bar(stat="identity",position="dodge") + 
  scale_fill_manual(values=rev(brewer.pal(9,"Blues")[5:9])) + labs(x="",y="log2(enrichment over background)",fill="") + coord_flip()

```

GO for fast/slow genes
```{r}

# convert to ENTREZID
# FLYBASE -> ENTREZID through org.Dm.eg.db for all genes and for the
# intron-/exon-definition gene sets.
genes.all <- bitr(
  data.frame(gene_name = all.genes)$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
genes.introndef.tr <- bitr(
  genes.introndef$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)
# genes.mixeddef.tr <- bitr(genes.mixeddef$gene_name, "FLYBASE", "ENTREZID", "org.Dm.eg.db")
genes.exondef.tr <- bitr(
  genes.exondef$gene_name,
  "FLYBASE", "ENTREZID", "org.Dm.eg.db"
)

# get enrichments
# GO BP enrichment of the intron-definition genes against testallgenes_tr
# (not defined in this section), with cutoffs of 1 so all terms are returned.
genes.introndef.enrich <- summary(enrichGO(
  gene = genes.introndef.tr$ENTREZID,
  organism = "fly",
  ont = "BP",
  pvalueCutoff = 1,
  pAdjustMethod = "BH",
  universe = testallgenes_tr$ENTREZID,
  qvalueCutoff = 1,
  readable = TRUE
))
#genes.mixeddef.enrich <- summary(enrichGO(gene=genes.mixeddef.tr$ENTREZID, organism="fly", ont="BP", pvalueCutoff=1,pAdjustMethod="BH",universe=testallgenes_tr$ENTREZID, qvalueCutoff=1, readable=TRUE))


```

Number of introns per gene
```{r}

# Per-gene annotation counts, joined onto the gene metadata by gene ID.
# NOTE(review): the column read is `num_exons` but it is stored as a number of
# introns -- confirm the file already holds intron counts.
txpts.mat <- read.table(
  "~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/dmel-exons_numannotated.txt",
  header=TRUE
)
combo.metadata.polyA.half.median$Number.of.annotated.introns <-
  txpts.mat$num_exons[match(combo.metadata.polyA.half.median$Gene.ID, txpts.mat$gene)]

# Cumulative distribution of first-intron length, one curve per annotated
# intron count (genes with at most 10 annotated and at least 1 detected intron).
ggplot(
  subset(combo.metadata.polyA.half.median,
         Number.of.annotated.introns <= 10 & Number.of.detected.introns > 0),
  aes(x=Length.first.intron, color=factor(Number.of.annotated.introns))
) +
  stat_ecdf() +
  scale_x_log10(label=comma) +
  scale_color_brewer(palette = "RdBu", guide=guide_legend(ncol=4, byrow=TRUE)) +
  labs(x="first intron length (nt)", y="cumulative frequency", color="number of annotated introns") +
  background_grid(major="y", minor="y") +
  theme(legend.position=c(0.7,0.2),
        axis.text.x=element_text(size=7),
        legend.text=element_text(size=7),
        legend.title=element_text(size=6))
```

Enrichment for alternative splicing in definition genes
```{r}

# Total and skipped exon counts per gene, joined onto the gene metadata.
exon_counts <- read.table("definition/exon_counts.txt", header=TRUE)
combo.metadata.polyA.half.median <- local({
  gene.table <- combo.metadata.polyA.half.median
  # One lookup from metadata genes into the exon-count table, reused below.
  count.row <- match(gene.table$Gene.ID, exon_counts$gene)
  gene.table$all_exon_count <- exon_counts$all[count.row]
  gene.table$skipped_exon_count <- exon_counts$skipped[count.row]
  gene.table
})

# Notched boxplots by definition class: detected introns, all exons, skipped
# exons, and the fraction of exons that are skipped.
ggplot(combo.metadata.polyA.half.median,
       aes(x=factor(definition), y=Number.of.detected.introns)) +
  geom_boxplot(notch=TRUE)
ggplot(combo.metadata.polyA.half.median,
       aes(x=factor(definition), y=all_exon_count)) +
  geom_boxplot(notch=TRUE)
ggplot(combo.metadata.polyA.half.median,
       aes(x=factor(definition), y=skipped_exon_count)) +
  geom_boxplot(notch=TRUE)
ggplot(combo.metadata.polyA.half.median,
       aes(x=factor(definition), y=skipped_exon_count/all_exon_count)) +
  geom_boxplot(notch=TRUE)

```

# First intron
```{r}
# Label every intron "non-first", except those with intronnum == 0, which are
# labelled "first" (rows with a missing intronnum stay "non-first").
combo.juncratio.data.parsed$first <- replace(
  rep("non-first", nrow(combo.juncratio.data.parsed)),
  which(combo.juncratio.data.parsed$intronnum == 0),
  "first"
)

# Intron length and fitted value by intron position (log-scaled y axis).
ggplot(combo.juncratio.data.parsed, aes(x=factor(intronnum), y=intronlen)) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()
ggplot(combo.juncratio.data.parsed, aes(x=factor(intronnum), y=fitvalue)) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()

# Fitted value against length for intron positions 0-9, coloured by position.
ggplot(subset(combo.juncratio.data.parsed, intronnum<=9),
       aes(x=intronlen, y=fitvalue, color=factor(intronnum))) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
```

Matching for first intron length
```{r}
# Stack three (overlapping) views of an intron table into one long table:
#   "first"      rows with intronnum == 0
#   "two.plus"   rows with intronnum >= 1
#   "three.plus" rows with intronnum >= 2
# datahere: data frame with columns intronnum, intronlen and fitvalue.
# Returns a data frame with columns type, pos (intronnum), len (intronlen)
# and half (fitvalue), with the groups in the order listed above.
get_combodata <- function(datahere){
  groups <- list(first = subset(datahere, intronnum == 0),
                 two.plus = subset(datahere, intronnum >= 1),
                 three.plus = subset(datahere, intronnum >= 2))
  group.sizes <- vapply(groups, nrow, integer(1))
  # Concatenate one column across the three groups, in group order.
  stack_column <- function(column){
    unlist(lapply(groups, function(g) g[[column]]), use.names = FALSE)
  }
  combo.data <- data.frame(type = rep(names(groups), times = group.sizes),
                           pos = stack_column("intronnum"),
                           len = stack_column("intronlen"),
                           half = stack_column("fitvalue"))
  return(combo.data)
}
# Build a length-matched comparison set for first introns.
#
# First introns (intronnum == 0) are binned by length in steps of `window`.
# For every bin, as many introns as there are first introns in that bin are
# drawn with replacement from the "two.plus" (intronnum >= 1) and "three.plus"
# (intronnum >= 2) introns whose length falls in the same bin.
#
# datahere: data frame with columns intronnum, intronlen and fitvalue.
# window:   bin width, in the units of intronlen.
# Returns a data frame with columns type ("first", "two.plus", "three.plus"),
# pos, len and half; all first introns are kept, followed by the sampled rows.
# Sampling uses the R random number generator, so call set.seed() beforehand
# for reproducible output.
get_matcheddata <- function(datahere, window){
  first.introns <- subset(datahere, intronnum == 0)
  twoplus.introns <- subset(datahere, intronnum >= 1)
  threeplus.introns <- subset(datahere, intronnum >= 2)
  # Draw n row indices with replacement from `candidates`. Going through
  # sample.int() avoids the sample() pitfall where a single candidate index k
  # is treated as "sample from 1:k". A bin with no candidates contributes
  # nothing instead of raising an error.
  draw_indices <- function(candidates, n){
    if(n == 0 || length(candidates) == 0){
      return(integer(0))
    }
    candidates[sample.int(length(candidates), n, replace=TRUE)]
  }
  # bin by window size
  # NOTE(review): bins are (lower, upper], so first introns at exactly the
  # minimum length, or beyond the last bin edge, get no matched partners.
  bin.vec <- seq(min(first.introns$intronlen), max(first.introns$intronlen), by=window)
  print(length(bin.vec))
  two.inds <- three.inds <- c()
  # seq_len() keeps the loop empty when there is only a single bin edge.
  for(i in seq_len(length(bin.vec) - 1) + 1){
    lower <- bin.vec[i-1]
    upper <- bin.vec[i]
    first.n <- sum(first.introns$intronlen > lower & first.introns$intronlen <= upper, na.rm=TRUE)
    two.hold <- which(twoplus.introns$intronlen > lower & twoplus.introns$intronlen <= upper)
    two.inds <- c(two.inds, draw_indices(two.hold, first.n))
    three.hold <- which(threeplus.introns$intronlen > lower & threeplus.introns$intronlen <= upper)
    three.inds <- c(three.inds, draw_indices(three.hold, first.n))
  }
  matched.data <- data.frame(type=c(rep("first",nrow(first.introns)),
                                    rep("two.plus",length(two.inds)),
                                    rep("three.plus",length(three.inds))),
                             pos = c(first.introns$intronnum, twoplus.introns$intronnum[two.inds], threeplus.introns$intronnum[three.inds]),
                             len = c(first.introns$intronlen, twoplus.introns$intronlen[two.inds], threeplus.introns$intronlen[three.inds]),
                             half = c(first.introns$fitvalue, twoplus.introns$fitvalue[two.inds], threeplus.introns$fitvalue[three.inds]))
  return(matched.data)
}

# Unmatched stacked table, with the intron classes in a fixed order.
combo.combodata <- get_combodata(combo.juncratio.data.parsed)
combo.combodata$type <- factor(
  combo.combodata$type,
  levels=c("first","two.plus","three.plus")
)

# Length-matched table (bin width 50), ordered the same way.
combo.matcheddata.50 <- get_matcheddata(combo.juncratio.data.parsed, 50)
combo.matcheddata.50$type <- factor(
  combo.matcheddata.50$type,
  levels=c("first","two.plus","three.plus")
)

# Combine the unmatched and length-matched tables into one long table.
#
# combodata, matcheddata: data frames with columns type ("first", "two.plus",
# "three.plus") and half.
# Returns a data frame with columns
#   type: "first", "second+" or "third+"
#   corr: which source the row came from ("all-first", "all-two", "all-three",
#         "matched-two", "matched-three")
#   half: the half column of the source rows
# in that source order. The "first" rows of matcheddata are not included.
get50data <- function(combodata, matcheddata){
  sources <- list(subset(combodata, type=="first"),
                  subset(combodata, type=="two.plus"),
                  subset(combodata, type=="three.plus"),
                  subset(matcheddata, type=="two.plus"),
                  subset(matcheddata, type=="three.plus"))
  type.labels <- c("first", "second+", "third+", "second+", "third+")
  corr.labels <- c("all-first", "all-two", "all-three", "matched-two", "matched-three")
  source.sizes <- vapply(sources, nrow, integer(1))
  data50 <- data.frame(type = rep(type.labels, times = source.sizes),
                       corr = rep(corr.labels, times = source.sizes),
                       half = unlist(lapply(sources, function(s) s$half), use.names = FALSE))
  return(data50)
}
combo.50data <- get50data(combo.combodata, combo.matcheddata.50)

# Three groups in one table: all first introns (num 0), the length-matched
# third+ introns (num 10), and the unmatched non-first introns at positions
# 1-4 (num = their intronnum).
combo.50data.new <- local({
  first.all <- subset(combo.50data, type=="first" & corr=="all-first")
  third.matched <- subset(combo.50data, type=="third+" & corr=="matched-three")
  nonfirst.early <- subset(combo.juncratio.data.parsed, intronnum > 0 & intronnum < 5)
  data.frame(type = c(as.character(first.all$type),
                      as.character(third.matched$type),
                      rep("non-first", nrow(nonfirst.early))),
             num = c(rep(0, nrow(first.all)),
                     rep(10, nrow(third.matched)),
                     nonfirst.early$intronnum),
             half = c(first.all$half,
                      third.matched$half,
                      nonfirst.early$fitvalue))
})

# One table per group, each padded with a single zero-valued row for the two
# other groups so that all three type values are present in every table.
combo.50data.new.left <- rbind(
  subset(combo.50data.new, type=="first"),
  data.frame(type=c("non-first","third+"), num=c(0,0), half=c(0,0))
)
combo.50data.new.mid <- rbind(
  subset(combo.50data.new, type=="non-first"),
  data.frame(type=c("first","third+"), num=c(0,0), half=c(0,0))
)
combo.50data.new.right <- rbind(
  subset(combo.50data.new, type=="third+"),
  data.frame(type=c("first","non-first"), num=c(0,0), half=c(0,0))
)

# Welch t-tests of first introns against the two comparison groups.
t.test(subset(combo.50data.new, type=="first")$half,
       subset(combo.50data.new, type=="non-first")$half)
t.test(subset(combo.50data.new, type=="first")$half,
       subset(combo.50data.new, type=="third+")$half)
```

Correlation of first intron with median half-life of other introns
```{r}

# Rate of the remaining introns against the length of the first intron.
ggplot(combo.metadata.polyA.half.median,
       aes(x=Length.first.intron, y=Splicing.rate.remaining.introns)) +
  geom_point(alpha=0.25) +
  scale_x_log10() +
  scale_y_log10()
# Rate of the remaining introns against the rate of the first intron.
ggplot(combo.metadata.polyA.half.median,
       aes(x=Splicing.rate.first.intron, y=Splicing.rate.remaining.introns)) +
  geom_point(alpha=0.25) +
  scale_x_log10() +
  scale_y_log10()

# Bin genes by first-intron length: "1" below 62, "2" from 62, "3" from 79 and
# "4" from 387. Genes with a missing length fall into bin "1", which is why
# NA is mapped to -Inf before the interval lookup.
combo.metadata.polyA.half.median$firstlen_thirds <- as.character(
  1L + findInterval(
    replace(combo.metadata.polyA.half.median$Length.first.intron,
            is.na(combo.metadata.polyA.half.median$Length.first.intron),
            -Inf),
    c(62, 79, 387)
  )
)

# Mean and standard error of Splicing.rate.remaining.introns for each
# first-intron length bin (firstlen_thirds "1" to "4").
# The standard error is sd / sqrt(n), where n counts only the genes with a
# non-missing rate, i.e. the same values the mean and sd are computed from.
# NOTE(review): the labels say "62-79", "80-387" and "> 388", while the bins
# are cut at >= 62, >= 79 and >= 387 -- confirm which boundaries are intended.
combo.first.thirds.data <- local({
  bin.ids <- c("1", "2", "3", "4")
  bin.labels <- c("< 62 nt","62-79 nt","80-387 nt","> 388 nt")
  # Non-missing rates of the genes in each bin.
  rates.by.bin <- lapply(bin.ids, function(id) {
    in.bin <- which(combo.metadata.polyA.half.median$firstlen_thirds == id)
    rates <- combo.metadata.polyA.half.median$Splicing.rate.remaining.introns[in.bin]
    rates[!is.na(rates)]
  })
  data.frame(bin = factor(bin.labels, levels = bin.labels),
             mean = vapply(rates.by.bin, mean, numeric(1)),
             sderr = vapply(rates.by.bin,
                            function(rates) sd(rates) / sqrt(length(rates)),
                            numeric(1)))
})

# mann-whitney U test
# Compare Splicing.rate.remaining.introns between neighbouring first-intron
# length bins (1 vs 2, 2 vs 3, 3 vs 4). firstlen_thirds is stored as character;
# the comparison with a number works because R coerces the number to a string.
wilcox.test(subset(combo.metadata.polyA.half.median, firstlen_thirds==1)$Splicing.rate.remaining.introns, subset(combo.metadata.polyA.half.median, firstlen_thirds==2)$Splicing.rate.remaining.introns)
wilcox.test(subset(combo.metadata.polyA.half.median, firstlen_thirds==2)$Splicing.rate.remaining.introns, subset(combo.metadata.polyA.half.median, firstlen_thirds==3)$Splicing.rate.remaining.introns)
wilcox.test(subset(combo.metadata.polyA.half.median, firstlen_thirds==3)$Splicing.rate.remaining.introns, subset(combo.metadata.polyA.half.median, firstlen_thirds==4)$Splicing.rate.remaining.introns)

# Bar per first-intron length bin (bars get more opaque with longer bins),
# error bars of +/- sderr, and hand-placed significance brackets between
# bins 2-3 and 3-4.
ggplot(combo.first.thirds.data, aes(x=factor(bin), y=mean)) +
  geom_bar(aes(alpha=factor(bin)),
           stat="identity",
           fill=wes_palette("FantasticFox")[3],
           width=0.85) +
  geom_errorbar(aes(ymin=mean-sderr, ymax=mean+sderr),
                color=wes_palette("Royal1")[1],
                width=0.5) +
  scale_y_continuous(limits=c(0,6), breaks=c(0,2,4,6)) +
  annotate("segment", x=c(2,3), xend=c(3,4), y=c(5.6,5.4), yend=c(5.6,5.4), size=0.25) +
  annotate("text", x=c(2.5,3.5), y=c(5.7,5.5), label=c("*","***")) +
  scale_alpha_manual(values=c(0.625,0.75,0.875,1), guide=FALSE) +
  labs(x="first intron length (nt)", y="median half-life (min)") +
  theme(axis.text.x=element_text(angle=45, hjust=1, size=6))

```

Looking at first intron differences for ID/ED introns
```{r}

# First introns only (intronnum == 0), grouped by IEratio_bin:
# length, fitted value, and length again restricted to len_bin == "100%".
ggplot(subset(combo.juncratio.data.parsed, intronnum==0),
       aes(x=factor(IEratio_bin), y=intronlen)) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()
ggplot(subset(combo.juncratio.data.parsed, intronnum==0),
       aes(x=factor(IEratio_bin), y=fitvalue)) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()
ggplot(subset(combo.juncratio.data.parsed, intronnum==0 & len_bin=="100%"),
       aes(x=factor(IEratio_bin), y=intronlen)) +
  geom_boxplot(notch=TRUE) +
  scale_y_log10()

```

Slower and Longer
```{r}

# Summarise intron length and fitted value by intron position.
#
# datahere: data frame with columns intronnum (0-based position), intronlen
#           and fitvalue.
# Returns a data frame with one row per position 0-9 (reported as intron_num
# 1-10) and columns
#   len, half       mean intronlen / fitvalue at that position
#   len_se, half_se standard error of those means
# Missing values are dropped for every statistic, and each standard error is
# sd / sqrt(n) with n the number of non-missing values, so a single NA in
# intronlen no longer turns len_se into NA.
getpointrange.data <- function(datahere){
  positions <- 0:9
  # Mean and standard error of one column at each intron position.
  summarise_column <- function(column){
    values.by.position <- lapply(positions, function(p) {
      values <- datahere[[column]][which(datahere$intronnum == p)]
      values[!is.na(values)]
    })
    list(mean = vapply(values.by.position, mean, numeric(1)),
         se = vapply(values.by.position,
                     function(values) sd(values) / sqrt(length(values)),
                     numeric(1)))
  }
  len.stats <- summarise_column("intronlen")
  half.stats <- summarise_column("fitvalue")
  pointrange.data <- data.frame(intron_num = 1:10,
                                len = len.stats$mean,
                                half = half.stats$mean,
                                len_se = len.stats$se,
                                half_se = half.stats$se)
  return(pointrange.data)
}

# Per-position mean length / fitted value (with standard errors) for the
# combined intron table.
pointrange.data <- getpointrange.data(combo.juncratio.data.parsed)


```

# Enhancer
```{r}
# Enhancer peak table (tab-separated, with header).
arnold_S2 <- read.table("enhancers/arnold_supp_table07.txt", sep="\t", header=TRUE)

# Four-column interval table: both coordinates shifted by +250, and a name
# built from the peak id, log2 enrichment and p-value joined by ";".
arnold_S2_peak <- data.frame(
  chr = arnold_S2$Chr,
  start = arnold_S2$Start+250,
  end = arnold_S2$End+250,
  name = paste(arnold_S2$Peak,
               arnold_S2$Enrichment..log2.,
               arnold_S2$P.value,
               sep=";")
)
write.table(arnold_S2_peak,
            file="enhancers/arnold_S2_peak.txt",
            sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)

# Intron/enhancer overlap table; column V4 holds the intron id and V7 the
# value stored as arnold_enhancers.
arnoldS2_introns <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_introns_arnoldS2enhancer_overlap.bed",
  sep="\t"
)

combo.juncratio.data.parsed$arnold_enhancers <-
  arnoldS2_introns$V7[match(rownames(combo.juncratio.data.parsed), arnoldS2_introns$V4)]

# "enhancer" when arnold_enhancers >= 1; otherwise (including introns with no
# entry in the overlap table) "no enhancer".
combo.juncratio.data.parsed$arnold_binary <- replace(
  rep("no enhancer", nrow(combo.juncratio.data.parsed)),
  which(combo.juncratio.data.parsed$arnold_enhancers >= 1),
  "enhancer"
)
```

# Regression

investigate all introns
```{r}

# length - in lm.miso.matrix
# 5'ss - in lm.miso.matrix
# 3'ss - in lm.miso.matrix
# AU% - in lm.miso.matrix (1-GC%)
# intron position - in lm.miso.matrix
# 1st intron length - add column
# 1st intron speed - add column
# length of upstream flanking exon - add column
# length of downstream flanking exon - add column

# add nucleotide content
# Nucleotide composition table; the intron id is the part of the `gene` field
# before the first ";".
introns.nucper <- read.table(
  "~/Dropbox (MIT)/Annotations/dm3_ensGene_annot/introns.sense.nochr.nucper",
  header=TRUE
)
names.nucper <- unlist(lapply(
  strsplit(as.character(introns.nucper$gene), split=";"),
  function(fields) fields[1]
))

# Append columns 2-6 of the composition table to the matching introns.
combo.juncratio.data.parsed <- cbind(
  combo.juncratio.data.parsed,
  introns.nucper[match(combo.juncratio.data.parsed$intron, names.nucper), 2:6]
)


# add first intron length & speed
# Split into first (intronnum == 0) and non-first introns, then copy the
# length, fitted value and enhancer flag of each gene's first intron onto
# that gene's non-first introns.
combo.juncratio.data.parsed.first <- subset(combo.juncratio.data.parsed, intronnum==0)
combo.juncratio.data.parsed.nonfirst <- local({
  nonfirst <- subset(combo.juncratio.data.parsed, intronnum>0)
  first.row <- match(nonfirst$gene, combo.juncratio.data.parsed.first$gene)
  nonfirst$first_len <- combo.juncratio.data.parsed.first$intronlen[first.row]
  nonfirst$first_halflife <- combo.juncratio.data.parsed.first$fitvalue[first.row]
  nonfirst$first_enhancer <- combo.juncratio.data.parsed.first$arnold_binary[first.row]
  nonfirst
})

# add length of flanking exon

# The row names are ":"-separated; field 7 is read as the strand, fields 2-3
# and 5-6 as the coordinates of the two flanking exons.
strands <- unlist(lapply(
  strsplit(rownames(combo.juncratio.data.parsed.nonfirst), split=":"),
  function(fields) fields[7]
))
firstexon <- local({
  fields <- strsplit(rownames(combo.juncratio.data.parsed.nonfirst), split=":")
  as.numeric(unlist(lapply(fields, "[", 3))) -
    as.numeric(unlist(lapply(fields, "[", 2)))
})
secondexon <- local({
  fields <- strsplit(rownames(combo.juncratio.data.parsed.nonfirst), split=":")
  as.numeric(unlist(lapply(fields, "[", 6))) -
    as.numeric(unlist(lapply(fields, "[", 5)))
})

# Upstream exon: the first listed exon, except on the "-" strand, where the
# second listed exon is used.
combo.juncratio.data.parsed.nonfirst$upexon_len <- replace(
  firstexon, which(strands =="-"), secondexon[which(strands=="-")]
)

# Downstream exon: the mirror image of the above.
combo.juncratio.data.parsed.nonfirst$downexon_len <- replace(
  secondexon, which(strands =="-"), firstexon[which(strands=="-")]
)

# add AUU motif number
# Motif-count tables (no header: V1 is matched against the intron row names,
# V2 is the value copied over).
auu.introns <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_introns.bed.fa.ATT.all"
)
auu.upexon <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_upexon.bed.fa.ATT.all"
)
auu.downexon <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_downexon.bed.fa.ATT.all"
)

# Attach the intron, upstream-exon and downstream-exon values by row name.
combo.juncratio.data.parsed.nonfirst <- local({
  nonfirst <- combo.juncratio.data.parsed.nonfirst
  intron.ids <- rownames(nonfirst)
  nonfirst$intron_auu <- auu.introns$V2[match(intron.ids, auu.introns$V1)]
  nonfirst$upexon_auu <- auu.upexon$V2[match(intron.ids, auu.upexon$V1)]
  nonfirst$downexon_auu <- auu.downexon$V2[match(intron.ids, auu.downexon$V1)]
  nonfirst
})

# add nonSS nucleotides + separated intronic regions
# Nucleotide composition tables for three regions (file names: noss_introns,
# noss_branchpoint, noss_nonBP); each has a `gene` id column and *_per columns.
noss_intron <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_noss_introns.bed.fa.nuc",
  header=TRUE
)
noss_branchpoint <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_noss_branchpoint.bed.fa.nuc",
  header=TRUE
)
noss_nonBP <- read.table(
  "Figures/figure4_variance/kmers/everything/everything_noss_nonBP.bed.fa.nuc",
  header=TRUE
)

# For each region, copy the GC, A, T, G and C columns onto the non-first
# introns (matched by row name) as <region>_GC, <region>_A, ... with the
# region prefixes intron, BP and nonBP, in that order.
combo.juncratio.data.parsed.nonfirst <- local({
  nonfirst <- combo.juncratio.data.parsed.nonfirst
  intron.ids <- rownames(nonfirst)
  region.tables <- list(intron = noss_intron, BP = noss_branchpoint, nonBP = noss_nonBP)
  for(region in names(region.tables)){
    nuc.table <- region.tables[[region]]
    table.row <- match(intron.ids, nuc.table$gene)
    for(nuc in c("GC", "A", "T", "G", "C")){
      # exact=FALSE keeps the partial column-name matching that `$` performs.
      nonfirst[[paste(region, nuc, sep="_")]] <-
        nuc.table[[paste0(nuc, "_per"), exact=FALSE]][table.row]
    }
  }
  nonfirst
})


### set binary variable for logit
#logit.value <- as.numeric(quantile(lm.miso.matrix.polyA.6070.nonfirst$half.lives, 0.5, na.rm=T))
#lm.miso.matrix.polyA.6070.nonfirst$logit <- 0
#lm.miso.matrix.polyA.6070.nonfirst$logit[which(lm.miso.matrix.polyA.6070.nonfirst$half.lives < logit.value)] <- 1

# Regression input table: one row per non-first intron. Composition columns
# are rescaled by 100, and the A+U columns are computed as (1 - GC) * 100.
lin.reg.data <- local({
  src <- combo.juncratio.data.parsed.nonfirst
  data.frame(
    length = src$intronlen,
    TPM = src$TPM_total,
    ss3 = src$ss3,
    ss5 = src$ss5,
    AUper = (1-src$intron_GC)*100,
    Uper = src$intron_T*100,
    Aper = src$intron_A*100,
    Gper = src$intron_G*100,
    Cper = src$intron_C*100,
    BP_AUper = (1-src$BP_GC)*100,
    nonBP_AUper = (1-src$nonBP_GC)*100,
    #AUUintrons = lm.miso.matrix.6070.nonfirst$intron_auu,
    #AUUupexon = lm.miso.matrix.6070.nonfirst$upexon_auu,
    #AUUdownexon = lm.miso.matrix.6070.nonfirst$downexon_auu,
    position = src$intronnum,
    length_first = src$first_len,
    halflife_first = src$first_halflife,
    enhancer_first = src$first_enhancer,
    length_upexon = src$upexon_len,
    length_downexon = src$downexon_len,
    enhancer = src$arnold_binary,
    definitionbin = src$IEratio_bin,
    definition = src$IEratio,
    half = src$fitvalue
  )
})

# Multiple regression of log10 half-life on intron, splice-site, composition,
# first-intron, flanking-exon, enhancer and definition features (all
# non-first introns).
lm.test <- lm(
  log10(half) ~ log10(length) + log10(TPM) + ss3 + ss5 +
    AUper + BP_AUper + nonBP_AUper + position +
    log10(length_first) + log10(halflife_first) + enhancer_first +
    log10(length_upexon) + log10(length_downexon) + enhancer + definition,
  data=lin.reg.data
)

# LMG relative importance (normalised to sum to 1) and its bootstrap (1000
# resamples).
relimp.log.lm.test <- calc.relimp(lm.test, type="lmg", rela=TRUE)
boot.lm <- booteval.relimp(
  boot.relimp(lm.test, b=1000, type=c("lmg"), rank=TRUE, diff=TRUE, rela=TRUE)
)

# Plot table: one row per model term (intercept dropped) plus four all-NA
# "blank" rows that act as spacers between groups of bars.
test.lm.data <- local({
  coefs <- summary(lm.test)$coefficients[-1, , drop=FALSE]
  term.rows <- data.frame(
    parameter = rownames(coefs),
    # Short names, in the order the terms appear in the model formula.
    names = c("length", "TPM","ss3","ss5","AUper","BP_AUper","nonBP_AUper",
              "position","length_first","halflife_first","enhancer_first",
              "length_upexon","length_downexon","enhancer","definition"),
    estimate = coefs[, 1],
    err = coefs[, 2],
    pval = coefs[, 4],
    relimp = as.numeric(relimp.log.lm.test$lmg)
    #relimp_lower = as.numeric(boot.lm$lmg.lower),
    #relimp_upper = as.numeric(boot.lm$lmg.upper)
  )
  spacer.ids <- c("blank1","blank2","blank3","blank4")
  spacer.rows <- data.frame(
    parameter = spacer.ids, names = spacer.ids,
    estimate = rep(NA,4), err = rep(NA,4), pval = rep(NA,4),
    relimp = rep(NA, 4)
    #, relimp_lower = rep(NA,4), relimp_upper = rep(NA, 4)
  )
  plot.rows <- rbind(term.rows, spacer.rows)
  # Display order of the bars (reversed because the plot is flipped).
  plot.rows$names <- factor(
    plot.rows$names,
    levels=rev(c("definition","length","position","enhancer","blank1",
                 "ss5","ss3","blank2",
                 "length_upexon","length_downexon","blank3",
                 "AUper","BP_AUper","nonBP_AUper","blank4",
                 "length_first","halflife_first","enhancer_first","TPM"))
  )
  # Direction of each coefficient, used for the bar colour.
  plot.rows$sign <- sign(plot.rows$estimate)
  plot.rows
})
# Relative importance of each model term as a horizontal bar, coloured by the
# sign of its coefficient, written to a PDF.
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/multreg/allintrons.pdf")
ggplot(test.lm.data, aes(x=factor(names), y=relimp*100)) +
  geom_bar(stat="identity", aes(fill=factor(sign))) +
#  geom_errorbar(aes(ymin=relimp_lower*100,ymax=relimp_upper*100),width=0.5,color=wes_palette("Rushmore")[4]) + 
  scale_fill_manual(
    values=c(wes_palette("FantasticFox")[3], wes_palette("Darjeeling")[1]),
    labels=c("shorter half-life","longer half-life")
  ) +
  scale_x_discrete(
    labels=rev(c("definition","intron length","intron position","enhancer in intron","",
                 "5'ss strength","3'ss strength","",
                 "upstream exon length","downstream exon length","",
                 "A+U%","A+U% in 3' region","A+U% in 5' region","",
                 "first intron length","first intron half-life","enhancer in first intron","gene expression"))
  ) +
  ylim(0,40) +
  labs(y="relative importance (%)", x="", fill="correlated with") +
  coord_flip() +
  background_grid(major="x") +
  theme(legend.position=c(0.75,0.2))
dev.off()

```

investigate 60-70nt introns
```{r}

# Restrict the regression table to introns of 60-70 nt.
lin.reg.data.6070 <- subset(lin.reg.data, length >= 60 & length <= 70)

# Same model as for all introns, without the definition term.
lm.test.6070 <- lm(log10(half) ~ log10(length) + log10(TPM) + ss3 + ss5 + AUper + BP_AUper + nonBP_AUper + 
                position + log10(length_first) + log10(halflife_first) + enhancer_first + 
                log10(length_upexon) + log10(length_downexon) + enhancer, data=lin.reg.data.6070)

# LMG relative importance (normalised to sum to 1) for the 60-70 nt model.
relimp.log.lm.test.6070 <- calc.relimp(lm.test.6070, type="lmg", rela=TRUE)
# Bootstrap the 60-70 nt model (lm.test.6070), not the all-intron model
# lm.test, so that the intervals belong to the model summarised in this chunk.
boot.lm <- booteval.relimp(boot.relimp(lm.test.6070, b=1000, type=c("lmg"), rank=TRUE, diff=TRUE, rela=TRUE))

# Plot table for the 60-70 nt model: one row per model term (intercept
# dropped) plus four all-NA "blank" spacer rows. This overwrites the
# test.lm.data built for the all-intron model.
test.lm.data <- local({
  coefs <- summary(lm.test.6070)$coefficients[-1, , drop=FALSE]
  term.rows <- data.frame(
    parameter = rownames(coefs),
    # Short names, in the order the terms appear in the model formula.
    names = c("length", "TPM","ss3","ss5","AUper","BP_AUper","nonBP_AUper",
              "position","length_first","halflife_first","enhancer_first",
              "length_upexon","length_downexon","enhancer"),
    estimate = coefs[, 1],
    err = coefs[, 2],
    pval = coefs[, 4],
    relimp = as.numeric(relimp.log.lm.test.6070$lmg)
    #relimp_lower = as.numeric(boot.lm$lmg.lower),
    #relimp_upper = as.numeric(boot.lm$lmg.upper)
  )
  spacer.ids <- c("blank1","blank2","blank3","blank4")
  spacer.rows <- data.frame(
    parameter = spacer.ids, names = spacer.ids,
    estimate = rep(NA,4), err = rep(NA,4), pval = rep(NA,4),
    relimp = rep(NA, 4)
    #, relimp_lower = rep(NA,4), relimp_upper = rep(NA, 4)
  )
  plot.rows <- rbind(term.rows, spacer.rows)
  # Display order of the bars (reversed because the plot is flipped).
  plot.rows$names <- factor(
    plot.rows$names,
    levels=rev(c("length","position","enhancer","blank1",
                 "ss5","ss3","blank2",
                 "length_upexon","length_downexon","blank3",
                 "AUper","BP_AUper","nonBP_AUper","blank4",
                 "length_first","halflife_first","enhancer_first","TPM"))
  )
  # Direction of each coefficient, used for the bar colour.
  plot.rows$sign <- sign(plot.rows$estimate)
  plot.rows
})
# Relative importance of each term of the 60-70 nt model as a horizontal bar,
# coloured by the sign of its coefficient, written to a PDF.
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/multreg/introns6070.pdf")
ggplot(test.lm.data, aes(x=factor(names), y=relimp*100)) +
  geom_bar(stat="identity", aes(fill=factor(sign))) +
#  geom_errorbar(aes(ymin=relimp_lower*100,ymax=relimp_upper*100),width=0.5,color=wes_palette("Rushmore")[4]) + 
  scale_fill_manual(
    values=c(wes_palette("FantasticFox")[3], wes_palette("Darjeeling")[1]),
    labels=c("shorter half-life","longer half-life")
  ) +
  scale_x_discrete(
    labels=rev(c("intron length","intron position","enhancer in intron","",
                 "5'ss strength","3'ss strength","",
                 "upstream exon length","downstream exon length","",
                 "A+U%","A+U% in 3' region","A+U% in 5' region","",
                 "first intron length","first intron half-life","enhancer in first intron","gene expression"))
  ) +
  ylim(0,40) +
  labs(y="relative importance (%)", x="", fill="correlated with") +
  coord_flip() +
  background_grid(major="x") +
  theme(legend.position=c(0.75,0.2))
dev.off()


```

Investigate intron-defined (introndef) introns
```{r}
# Multiple regression of log10 half-life on intron/gene features, restricted
# to rows of lin.reg.data whose definitionbin is "introndef".
lin.reg.data.ID <- subset(lin.reg.data, definitionbin == "introndef")

lm.test.ID <- lm(log10(half) ~ log10(length) + log10(TPM) + ss3 + ss5 + AUper + BP_AUper + nonBP_AUper + 
                position + log10(length_first) + log10(halflife_first) + enhancer_first + 
                log10(length_upexon) + log10(length_downexon) + enhancer, data=lin.reg.data.ID)

# LMG relative importance (rela = TRUE) and its bootstrap (b = 1000) evaluation.
relimp.log.lm.test.ID <- calc.relimp(lm.test.ID, type="lmg", rela=TRUE)
boot.lm.ID <- booteval.relimp(boot.relimp(lm.test.ID, b=1000, type=c("lmg"), rank=TRUE, diff=TRUE, rela=TRUE))

# One row per model term (intercept row dropped). `names` is a hand-written
# short label per term and must stay in the same order as the formula terms.
test.lm.data.ID <- data.frame(parameter = rownames(summary(lm.test.ID)$coefficients)[-1],
                           names = c("length", "TPM","ss3","ss5","AUper","BP_AUper","nonBP_AUper",
                                     "position","length_first","halflife_first","enhancer_first","length_upexon","length_downexon","enhancer"),
                           estimate = summary(lm.test.ID)$coefficients[-1,1],
                           err = summary(lm.test.ID)$coefficients[-1,2],
                           pval = summary(lm.test.ID)$coefficients[-1,4],
                           relimp = as.numeric(relimp.log.lm.test.ID$lmg),
                           relimp_lower = as.numeric(boot.lm.ID$lmg.lower),
                           relimp_upper = as.numeric(boot.lm.ID$lmg.upper))
# Four all-NA "blank" rows act as empty spacer bars between groups of terms.
test.lm.data.ID <- rbind(test.lm.data.ID, 
                      data.frame(parameter = c("blank1","blank2","blank3","blank4"), names = c("blank1","blank2","blank3","blank4"),
                                 estimate = rep(NA,4), err = rep(NA,4), pval = rep(NA,4), 
                                 relimp = rep(NA, 4), relimp_lower = rep(NA,4), relimp_upper = rep(NA, 4)))
# Display order of the terms (reversed because the plot uses coord_flip()).
test.lm.data.ID$names <- factor(test.lm.data.ID$names, levels=rev(c("length","position","enhancer","blank1","ss5","ss3","blank2","length_upexon","length_downexon","blank3",
                                                              "AUper","BP_AUper","nonBP_AUper","blank4","length_first","halflife_first","enhancer_first","TPM")))
# Direction of each coefficient (NA for spacer rows), used as bar fill.
test.lm.data.ID$sign <- sign(test.lm.data.ID$estimate)
# Fixed: the figure previously plotted `test.lm.data` (the 60-70 nt table)
# rather than the introndef table built above, and the file name carried a
# doubled ".pdf" extension.
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/multreg/intronsID.pdf")
ggplot(test.lm.data.ID, aes(x=factor(names),y=relimp*100)) + geom_bar(stat="identity",aes(fill=factor(sign))) + 
#  geom_errorbar(aes(ymin=relimp_lower*100,ymax=relimp_upper*100),width=0.5,color=wes_palette("Rushmore")[4]) + 
  scale_fill_manual(values=c(wes_palette("FantasticFox")[3],wes_palette("Darjeeling")[1]),labels=c("shorter half-life","longer half-life")) +
  scale_x_discrete(labels=rev(c("intron length","intron position","enhancer in intron","","5'ss strength","3'ss strength","",
                                "upstream exon length","downstream exon length","","A+U%","A+U% in 3' region","A+U% in 5' region","",
                                "first intron length","first intron half-life","enhancer in first intron","gene expression"))) +
  ylim(0,40) + labs(y="relative importance (%)",x="",fill="correlated with") + coord_flip() + background_grid(major="x") + theme(legend.position=c(0.75,0.2))
dev.off()


```

Investigate exon-defined (exondef) introns
```{r}
# Multiple regression of log10 half-life on intron/gene features, restricted
# to rows of lin.reg.data whose definitionbin is "exondef".
lin.reg.data.ED <- subset(lin.reg.data, definitionbin == "exondef")

lm.test.ED <- lm(log10(half) ~ log10(length) + log10(TPM) + ss3 + ss5 + AUper + BP_AUper + nonBP_AUper + 
                position + log10(length_first) + log10(halflife_first) + enhancer_first + 
                log10(length_upexon) + log10(length_downexon) + enhancer, data=lin.reg.data.ED)

# LMG relative importance (rela = TRUE) and its bootstrap (b = 1000) evaluation.
relimp.log.lm.test.ED <- calc.relimp(lm.test.ED, type="lmg", rela=TRUE)
boot.lm.ED <- booteval.relimp(boot.relimp(lm.test.ED, b=1000, type=c("lmg"), rank=TRUE, diff=TRUE, rela=TRUE))

# One row per model term (intercept row dropped). `names` is a hand-written
# short label per term and must stay in the same order as the formula terms.
test.lm.data.ED <- data.frame(parameter = rownames(summary(lm.test.ED)$coefficients)[-1],
                           names = c("length", "TPM","ss3","ss5","AUper","BP_AUper","nonBP_AUper",
                                     "position","length_first","halflife_first","enhancer_first","length_upexon","length_downexon","enhancer"),
                           estimate = summary(lm.test.ED)$coefficients[-1,1],
                           err = summary(lm.test.ED)$coefficients[-1,2],
                           pval = summary(lm.test.ED)$coefficients[-1,4],
                           relimp = as.numeric(relimp.log.lm.test.ED$lmg),
                           relimp_lower = as.numeric(boot.lm.ED$lmg.lower),
                           relimp_upper = as.numeric(boot.lm.ED$lmg.upper))
# Four all-NA "blank" rows act as empty spacer bars between groups of terms.
test.lm.data.ED <- rbind(test.lm.data.ED, 
                      data.frame(parameter = c("blank1","blank2","blank3","blank4"), names = c("blank1","blank2","blank3","blank4"),
                                 estimate = rep(NA,4), err = rep(NA,4), pval = rep(NA,4), 
                                 relimp = rep(NA, 4), relimp_lower = rep(NA,4), relimp_upper = rep(NA, 4)))
# Display order of the terms (reversed because the plot uses coord_flip()).
test.lm.data.ED$names <- factor(test.lm.data.ED$names, levels=rev(c("length","position","enhancer","blank1","ss5","ss3","blank2","length_upexon","length_downexon","blank3",
                                                              "AUper","BP_AUper","nonBP_AUper","blank4","length_first","halflife_first","enhancer_first","TPM")))
# Direction of each coefficient (NA for spacer rows), used as bar fill.
test.lm.data.ED$sign <- sign(test.lm.data.ED$estimate)
# Fixed: the figure previously plotted `test.lm.data` (the 60-70 nt table)
# and was written to the introndef file name, overwriting that figure. It now
# plots the exondef table and goes to its own file.
pdf("~/Dropbox (MIT)/Projects/Adelman/timecourse/Figures/multreg/intronsED.pdf")
ggplot(test.lm.data.ED, aes(x=factor(names),y=relimp*100)) + geom_bar(stat="identity",aes(fill=factor(sign))) + 
#  geom_errorbar(aes(ymin=relimp_lower*100,ymax=relimp_upper*100),width=0.5,color=wes_palette("Rushmore")[4]) + 
  scale_fill_manual(values=c(wes_palette("FantasticFox")[3],wes_palette("Darjeeling")[1]),labels=c("shorter half-life","longer half-life")) +
  scale_x_discrete(labels=rev(c("intron length","intron position","enhancer in intron","","5'ss strength","3'ss strength","",
                                "upstream exon length","downstream exon length","","A+U%","A+U% in 3' region","A+U% in 5' region","",
                                "first intron length","first intron half-life","enhancer in first intron","gene expression"))) +
  ylim(0,40) + labs(y="relative importance (%)",x="",fill="correlated with") + coord_flip() + background_grid(major="x") + theme(legend.position=c(0.75,0.2))
dev.off()
```

Plot ID and ED regressions together
```{r}

# Tag each regression table with its definition class; the tag is the facet
# variable of the combined plot.
test.lm.data.ID$def <- "intron definition (RIME < 0.75)"
test.lm.data.ED$def <- "exon definition (RIME > 1.33)"

# Fixed: only the introndef table was plotted, so facet_wrap(~def) produced a
# single panel. Both tables are now stacked so each class gets its own facet.
ggplot(rbind(test.lm.data.ID, test.lm.data.ED), aes(x=factor(names),y=relimp*100)) + geom_bar(stat="identity",aes(fill=factor(sign))) + 
#  geom_errorbar(aes(ymin=relimp_lower*100,ymax=relimp_upper*100),width=0.5,color=wes_palette("Rushmore")[4]) + 
  scale_fill_manual(values=c(wes_palette("FantasticFox")[3],wes_palette("Darjeeling")[1]),labels=c("shorter half-life","longer half-life")) +
  scale_x_discrete(labels=rev(c("intron length","intron position","enhancer in intron","","5'ss strength","3'ss strength","",
                                "upstream exon length","downstream exon length","","A+U%","A+U% in 3' region","A+U% in 5' region","",
                                "first intron length","first intron half-life","enhancer in first intron","gene expression"))) +
  ylim(0,40) + labs(y="relative importance (%)",x="",fill="correlated with") + coord_flip() + background_grid(major="x") + theme(legend.position=c(0.75,0.2)) + facet_wrap(~def)

```

# Expression
```{r}
# Label every intron by its length relative to the 60-70 nt window:
# below 60, above 70, or (default, including rows with missing length) inside.
combo.juncratio.data.parsed$optimal <- rep("60-70nt", nrow(combo.juncratio.data.parsed))
combo.juncratio.data.parsed$optimal[!is.na(combo.juncratio.data.parsed$intronlen) &
                                      combo.juncratio.data.parsed$intronlen < 60] <- "<60-70nt"
combo.juncratio.data.parsed$optimal[!is.na(combo.juncratio.data.parsed$intronlen) &
                                      combo.juncratio.data.parsed$intronlen > 70] <- ">60-70nt"

# TPM_total (log10 axis) per length class, split by the `first` column.
ggplot(combo.juncratio.data.parsed,
       aes(x = factor(optimal), y = TPM_total, fill = factor(first))) +
  geom_boxplot(notch = TRUE) +
  scale_y_log10()
```
# Accuracy
```{r}

# Pool junction counts over the three replicates of one labeling time point.
#
# time: string inserted into the file names, e.g. "5m"; the files
#       Adelman4sU_<time>.rep{1,2,3}.junction.combine are read with a header.
# Returns a data frame with `intron` and `len` taken from replicate 1, the
# per-row sum over replicates of column `junc` (as `total`) and of columns
# X0-X6 / X20-X26 (as junc0-junc6 / junc20-junc26), plus three derived sums:
#   unannotated  = junc0 + ... + junc6
#   annotated    = junc20 + ... + junc26
#   unanno_canno = junc1 + ... + junc6
# Rows are combined by position, so the three files are assumed to list the
# same introns in the same order -- TODO confirm.
sumreps_junc <- function(time){
  file.prefix <- "~/Dropbox (MIT)/Projects/Adelman/timecourse/accuracy/junctioncombine/Adelman4sU_"
  reps <- lapply(1:3, function(k) {
    read.table(paste0(file.prefix, time, ".rep", k, ".junction.combine"), header = TRUE)
  })
  # Row-wise sum of one column across the replicates (partial name matching
  # is kept, as with `$`).
  pooled <- function(col) {
    rowSums(do.call(cbind, lapply(reps, function(d) d[[col, exact = FALSE]])))
  }
  raw.cols <- paste0("X", c(0:6, 20:26))
  junc.sums <- lapply(raw.cols, pooled)
  names(junc.sums) <- sub("X", "junc", raw.cols)
  hold.data <- data.frame(intron = reps[[1]]$intron, len = reps[[1]]$len,
                          total = pooled("junc"), junc.sums)
  # Derived totals over groups of junction classes.
  hold.data$unannotated <- rowSums(do.call(cbind, junc.sums[paste0("junc", 0:6)]))
  hold.data$annotated <- rowSums(do.call(cbind, junc.sums[paste0("junc", 20:26)]))
  hold.data$unanno_canno <- rowSums(do.call(cbind, junc.sums[paste0("junc", 1:6)]))
  hold.data
}

# Replicate-pooled junction counts for each labeling time.
accuracy.5m <- sumreps_junc("5m")
accuracy.10m <- sumreps_junc("10m")
accuracy.20m <- sumreps_junc("20m")

# Stack the three tables with a `time` label per row. Each label is repeated
# by its own table's row count, so the labels stay aligned even if the tables
# differ in length (previously every label was repeated nrow(accuracy.5m) times).
accuracy.all <- data.frame(time = rep(c("5m","10m","20m"),
                                      times = c(nrow(accuracy.5m), nrow(accuracy.10m), nrow(accuracy.20m))),
                           rbind(accuracy.5m, accuracy.10m, accuracy.20m))
accuracy.all$time <- factor(accuracy.all$time, levels=c("5m","10m","20m"))

# Exploratory scatter plots of junction-class fractions against `len` (log-log).
ggplot(accuracy.all, aes(x=len,y=annotated/total)) + geom_point() + scale_x_log10() + scale_y_log10()
ggplot(accuracy.all, aes(x=len,y=unannotated/total)) + geom_point() + scale_x_log10() + scale_y_log10()
ggplot(accuracy.all, aes(x=len,y=junc0/total)) + geom_point() + scale_x_log10() + scale_y_log10()
# +1 pseudocount on numerator and denominator keeps zero counts on the log axis.
ggplot(accuracy.all, aes(x=len,y=(junc0+1)/(total+1))) + geom_point() + scale_x_log10() + scale_y_log10() + geom_smooth() + facet_wrap(~time)

# Copy the 5m counts onto the per-intron table, matched by intron ID
# (introns absent from accuracy.5m get NA).
accuracy.inds <- match(combo.juncratio.data.parsed$intron, accuracy.5m$intron)

combo.juncratio.data.parsed$unannotated.5m <- accuracy.5m$unannotated[accuracy.inds]
combo.juncratio.data.parsed$junc0.5m <- accuracy.5m$junc0[accuracy.inds]
combo.juncratio.data.parsed$total.5m <- accuracy.5m$total[accuracy.inds]

# junc0 fraction per IEratio_bin class, raw and with pseudocount, written to
# Accuracy.pdf in the working directory.
pdf("Accuracy.pdf")
ggplot(combo.juncratio.data.parsed, aes(x=factor(IEratio_bin), y=junc0.5m/total.5m)) + geom_boxplot(notch=TRUE) + ylim(0,0.01)
ggplot(combo.juncratio.data.parsed, aes(x=factor(IEratio_bin), y=(junc0.5m+1)/(total.5m+1))) + geom_boxplot(notch=TRUE) + ylim(0,0.1)
ggplot(combo.juncratio.data.parsed, aes(x=factor(IEratio_bin), y=(junc0.5m+1)/(total.5m+1))) + geom_boxplot(notch=TRUE) + scale_y_log10()
dev.off()

### match for length

# Build an intron-length-matched set across the three IEratio_bin classes.
#
# datahere: data frame with columns IEratio_bin ("exondef", "introndef",
#           "confused"), intronlen, fitvalue, TPM, IEratio, unannotated.5m,
#           junc0.5m and total.5m.
# window:   width of the intron-length bins.
#
# Bins run from floor(min) to ceiling(max) of the exondef intron lengths in
# steps of `window`; each bin is the half-open interval (lower, upper], so a
# value equal to the first edge, or beyond the last edge, falls in no bin.
# In every bin where all three classes are present, all exondef introns are
# kept and the same number of introndef and confused introns are drawn WITH
# replacement. Prints the number of bin edges.
#
# Returns a data frame with columns type ("exon"/"intron"/"mixed"), rate,
# intronlen, TPM, IEratio, unannotated, junc0 and total.
LENmatch <- function(datahere, window){
  # sample(x, ...) treats a length-1 numeric x as 1:x, which would return
  # arbitrary row indices when a bin holds a single candidate; draw positions
  # instead so the result always comes from `pool`.
  draw <- function(pool, size) pool[sample.int(length(pool), size, replace = TRUE)]

  exon.data <- subset(datahere, IEratio_bin=="exondef")
  intron.data <- subset(datahere, IEratio_bin=="introndef")
  mixed.data <- subset(datahere, IEratio_bin=="confused")
  # bin by window size
  bin.vec <- seq(floor(min(exon.data$intronlen)), ceiling(max(exon.data$intronlen)), by=window)
  print(length(bin.vec))
  exon.inds <- intron.inds <- mixed.inds <- c()
  # upper-edge index of every bin; empty when there is only one edge
  for(i in seq_len(length(bin.vec) - 1) + 1){
    exon.hold <- which(exon.data$intronlen > bin.vec[i-1] & exon.data$intronlen <= bin.vec[i])
    intron.hold <- which(intron.data$intronlen > bin.vec[i-1] & intron.data$intronlen <= bin.vec[i])
    mixed.hold <- which(mixed.data$intronlen > bin.vec[i-1] & mixed.data$intronlen <= bin.vec[i])
    if(length(exon.hold) > 0 && length(intron.hold) > 0 && length(mixed.hold) > 0){
      exon.inds <- c(exon.inds, exon.hold)
      intron.inds <- c(intron.inds, draw(intron.hold, length(exon.hold)))
      mixed.inds <- c(mixed.inds, draw(mixed.hold, length(exon.hold)))
    }
  }
  matched.data <- data.frame(type=c(rep("exon",length(exon.inds)),
                                    rep("intron",length(intron.inds)),
                                    rep("mixed",length(mixed.inds))),
                             rate = c(exon.data$fitvalue[exon.inds], intron.data$fitvalue[intron.inds], mixed.data$fitvalue[mixed.inds]),
                             intronlen = c(exon.data$intronlen[exon.inds], intron.data$intronlen[intron.inds], mixed.data$intronlen[mixed.inds]),
                             TPM = c(exon.data$TPM[exon.inds], intron.data$TPM[intron.inds], mixed.data$TPM[mixed.inds]),
                             IEratio = c(exon.data$IEratio[exon.inds], intron.data$IEratio[intron.inds], mixed.data$IEratio[mixed.inds]),
                             unannotated = c(exon.data$unannotated.5m[exon.inds], intron.data$unannotated.5m[intron.inds], mixed.data$unannotated.5m[mixed.inds]),
                             junc0 = c(exon.data$junc0.5m[exon.inds], intron.data$junc0.5m[intron.inds], mixed.data$junc0.5m[mixed.inds]),
                             total = c(exon.data$total.5m[exon.inds], intron.data$total.5m[intron.inds], mixed.data$total.5m[mixed.inds]))
  return(matched.data)
}
# Build a TPM-matched set across the three IEratio_bin classes.
#
# datahere: data frame with columns IEratio_bin ("exondef", "introndef",
#           "confused"), TPM, intronlen, fitvalue, IEratio, unannotated.5m,
#           junc0.5m and total.5m.
# window:   width of the TPM bins.
#
# Bins run from floor(min) to ceiling(max) of the exondef TPM values in steps
# of `window`; each bin is the half-open interval (lower, upper]. In every
# bin where all three classes are present, introns are drawn WITHOUT
# replacement so the classes contribute equal numbers. If introndef or
# confused has fewer candidates than exondef in a bin, all classes are
# reduced to the smallest available count (this case used to abort with
# "cannot take a sample larger than the population"); otherwise every exondef
# intron in the bin is kept. Prints the number of bin edges.
#
# Returns a data frame with columns type ("exon"/"intron"/"mixed"), rate,
# intronlen, TPM, IEratio, unannotated, junc0 and total.
TPMmatch <- function(datahere, window){
  # sample(x, ...) treats a length-1 numeric x as 1:x; draw positions instead
  # so the result always comes from `pool`.
  draw <- function(pool, size) pool[sample.int(length(pool), size, replace = FALSE)]

  exon.data <- subset(datahere, IEratio_bin=="exondef")
  intron.data <- subset(datahere, IEratio_bin=="introndef")
  mixed.data <- subset(datahere, IEratio_bin=="confused")
  # bin by window size
  bin.vec <- seq(floor(min(exon.data$TPM)), ceiling(max(exon.data$TPM)), by=window)
  print(length(bin.vec))
  exon.inds <- intron.inds <- mixed.inds <- c()
  # upper-edge index of every bin; empty when there is only one edge
  for(i in seq_len(length(bin.vec) - 1) + 1){
    exon.hold <- which(exon.data$TPM > bin.vec[i-1] & exon.data$TPM <= bin.vec[i])
    intron.hold <- which(intron.data$TPM > bin.vec[i-1] & intron.data$TPM <= bin.vec[i])
    mixed.hold <- which(mixed.data$TPM > bin.vec[i-1] & mixed.data$TPM <= bin.vec[i])
    n.match <- min(length(exon.hold), length(intron.hold), length(mixed.hold))
    if(n.match > 0){
      # only subsample the exon class when another class is the limiting one
      if(n.match < length(exon.hold)){
        exon.hold <- draw(exon.hold, n.match)
      }
      exon.inds <- c(exon.inds, exon.hold)
      intron.inds <- c(intron.inds, draw(intron.hold, n.match))
      mixed.inds <- c(mixed.inds, draw(mixed.hold, n.match))
    }
  }
  matched.data <- data.frame(type=c(rep("exon",length(exon.inds)),
                                    rep("intron",length(intron.inds)),
                                    rep("mixed",length(mixed.inds))),
                             rate = c(exon.data$fitvalue[exon.inds], intron.data$fitvalue[intron.inds], mixed.data$fitvalue[mixed.inds]),
                             intronlen = c(exon.data$intronlen[exon.inds], intron.data$intronlen[intron.inds], mixed.data$intronlen[mixed.inds]),
                             TPM = c(exon.data$TPM[exon.inds], intron.data$TPM[intron.inds], mixed.data$TPM[mixed.inds]),
                             IEratio = c(exon.data$IEratio[exon.inds], intron.data$IEratio[intron.inds], mixed.data$IEratio[mixed.inds]),
                             unannotated = c(exon.data$unannotated.5m[exon.inds], intron.data$unannotated.5m[intron.inds], mixed.data$unannotated.5m[mixed.inds]),
                             junc0 = c(exon.data$junc0.5m[exon.inds], intron.data$junc0.5m[intron.inds], mixed.data$junc0.5m[mixed.inds]),
                             total = c(exon.data$total.5m[exon.inds], intron.data$total.5m[intron.inds], mixed.data$total.5m[mixed.inds]))
  return(matched.data)
}

# Length-matched (50-unit bins) and TPM-matched (2-unit bins) intron sets.
# Both helpers sample at random, so results vary between runs unless a seed
# is set beforehand.
lenmatched <- LENmatch(combo.juncratio.data.parsed, 50)
lenmatched$type <- factor(lenmatched$type, levels=c("intron","mixed","exon"))

tpmmatched <- TPMmatch(combo.juncratio.data.parsed, 2)
tpmmatched$type <- factor(tpmmatched$type, levels=c("intron","mixed","exon"))

# Per-class distributions of the matched variables and of the
# pseudocounted junc0 fraction, written to Accuracy_matched.pdf.
pdf("Accuracy_matched.pdf")
ggplot(lenmatched, aes(x=factor(type),y=intronlen)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(lenmatched, aes(x=factor(type),y=rate)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(lenmatched, aes(x=factor(type),y=TPM)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(lenmatched, aes(x=factor(type),y=(junc0+1)/(total+1))) + geom_boxplot(notch=TRUE) + ylim(0,0.25)
ggplot(lenmatched, aes(color=factor(type),x=(junc0+1)/(total+1))) + stat_ecdf() + xlim(0,0.25)

ggplot(tpmmatched, aes(x=factor(type),y=intronlen)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(tpmmatched, aes(x=factor(type),y=rate)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(tpmmatched, aes(x=factor(type),y=TPM)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(tpmmatched, aes(x=factor(type),y=(junc0+1)/(total+1))) + geom_boxplot(notch=TRUE) + ylim(0,0.1)
ggplot(tpmmatched, aes(color=factor(type),x=(junc0+1)/(total+1))) + stat_ecdf() + xlim(0,0.25)

dev.off()

# Unmatched data, restricted to introns with at least one junc0 read.
ggplot(subset(combo.juncratio.data.parsed, junc0.5m!=0), aes(x=fitvalue,y=junc0.5m/total.5m)) + geom_point(alpha=0.5) + scale_x_log10() + scale_y_log10()
ggplot(subset(combo.juncratio.data.parsed, junc0.5m>0), aes(x=factor(IEratio_bin),y=junc0.5m/total.5m)) + geom_boxplot(notch=TRUE) + scale_y_log10()


# Pseudocounted junc0 fraction for the length-matched set.
lenmatched$accuracy <- (lenmatched$junc0 + 1)/(lenmatched$total+1)
# ECDF of that fraction per class (length-matched set)
ggplot(lenmatched, aes(color=factor(type),x=accuracy)) + stat_ecdf() + xlim(0,0.25) + 
  scale_color_manual(values=c("deeppink4","grey57","dodgerblue4"))

# Pairwise two-sample KS tests between the three classes.
ks.test(subset(lenmatched, type=="intron")$accuracy, subset(lenmatched, type=="mixed")$accuracy)
ks.test(subset(lenmatched, type=="intron")$accuracy, subset(lenmatched, type=="exon")$accuracy)
# Fixed: this test compared "mixed" with itself; the remaining pair is mixed vs exon.
ks.test(subset(lenmatched, type=="mixed")$accuracy, subset(lenmatched, type=="exon")$accuracy)


```


# Order of splicing
```{r}

# Two-intron ordering term.
#
# x: numeric vector c(h1, h2, e, i2). The caller in this file passes the
#    fitvalue of the first and second intron, the first intron's
#    downexon_len and the second intron's intronlen.
# r: divisor applied together with h1 (the caller passes 1500; presumably a
#    transcription rate in nt/min -- confirm).
#
# Returns h1/(h1+h2) * exp(-(e+i2)/(h1*r)).
twointrons_order <- function(x, r){
  half_first <- x[1]
  half_second <- x[2]
  downstream_len <- x[3] + x[4]
  order_weight <- half_first / (half_first + half_second)
  decay <- exp(-(downstream_len / (half_first * r)))
  order_weight * decay
}

# Three-intron ordering term.
#
# x: numeric vector c(h1, h2, h3, e1, e2, i2, i3). The caller in this file
#    passes the fitvalue of introns 1-3, the downexon_len of introns 1-2 and
#    the intronlen of introns 2-3.
# r: divisor applied together with h1 or h2 (the caller passes 1500;
#    presumably a transcription rate in nt/min -- confirm).
#
# With A = e1+i2, B = e2+i3 and T = A+B, returns
#   h1/(h1+h2) * (exp(-A/(h1*r)) - exp(-T/(h1*r)) * exp(-B/(h2*r)))
#   + (h1*h3)/(h2*h3 + h1*h3 + h1*h2) * exp(-T/(h1*r)) * exp(-T/(h2*r)).
threeintrons_order <- function(x, r){
  half1 <- x[1]
  half2 <- x[2]
  half3 <- x[3]
  span_first <- x[4] + x[6]                # e1 + i2
  span_second <- x[5] + x[7]               # e2 + i3
  span_total <- x[4] + x[6] + x[5] + x[7]  # e1 + i2 + e2 + i3
  weight_pair <- half1 / (half1 + half2)
  weight_triple <- (half1 * half3) / ((half2 * half3) + (half1 * half3) + (half1 * half2))
  decay_first_h1 <- exp(-(span_first / (half1 * r)))
  decay_total_h1 <- exp(-(span_total / (half1 * r)))
  decay_second_h2 <- exp(-(span_second / (half2 * r)))
  decay_total_h2 <- exp(-(span_total / (half2 * r)))
  (weight_pair * (decay_first_h1 - (decay_total_h1 * decay_second_h2))) +
    (weight_triple * decay_total_h1 * decay_total_h2)
}
  
# Per-gene ordering terms from the first two / first three introns
# (intronnum 0, 1, 2). Genes lacking the required introns keep NA.
combo.metadata.polyA.half.median$twoprob <- NA
combo.metadata.polyA.half.median$threeprob <- NA

# Value passed as `r` to the ordering functions.
r <- 1500

# First row of combo.juncratio.data.parsed for a gene and intron number, or
# NULL when there is none. (The old `subset(...)[1,]` always produced one
# row -- all NA when nothing matched -- so its `nrow(...) == 1` guards could
# never be FALSE.)
first_intron_row <- function(genehere, num){
  hits <- which(combo.juncratio.data.parsed$gene == genehere &
                  combo.juncratio.data.parsed$intronnum == num)
  if(length(hits) == 0){
    return(NULL)
  }
  combo.juncratio.data.parsed[hits[1], ]
}

for(i in seq_len(nrow(combo.metadata.polyA.half.median))){
  print(i)
  genehere <- as.character(combo.metadata.polyA.half.median$Gene.ID[i])
  twoprob <- threeprob <- NA
  firstdata <- first_intron_row(genehere, 0)
  seconddata <- first_intron_row(genehere, 1)
  if(!is.null(firstdata) && !is.null(seconddata)){
    h1 <- firstdata$fitvalue; e1 <- firstdata$downexon_len
    h2 <- seconddata$fitvalue; i2 <- seconddata$intronlen; e2 <- seconddata$downexon_len
    twoprob <- twointrons_order(c(h1, h2, e1, i2), r)
    thirddata <- first_intron_row(genehere, 2)
    if(!is.null(thirddata)){
      h3 <- thirddata$fitvalue; i3 <- thirddata$intronlen
      threeprob <- threeintrons_order(c(h1, h2, h3, e1, e2, i2, i3), r)
    }
  }
  combo.metadata.polyA.half.median$twoprob[i] <- twoprob
  combo.metadata.polyA.half.median$threeprob[i] <- threeprob
}

# Density histograms of both terms, split by the gene's `definition` column.
ggplot(combo.metadata.polyA.half.median, aes(x=twoprob,fill=factor(definition),y=..density..)) + geom_histogram(position="dodge")
ggplot(combo.metadata.polyA.half.median, aes(x=threeprob,fill=factor(definition),y=..density..)) + geom_histogram(position="dodge")

```

# U12 introns
```{r}

# Intron IDs treated as U12-type.
u12introns <- c("chr2L:16743405:16743570:+@chr2L:16743760:16744067:+","chr3R:22788880:22789242:-@chr3R:22789840:22790007:-","chrX:9231731:9231925:-@chrX:9232086:9232258:-","chrX:17517254:17518238:-@chrX:17519563:17519691:-","chr2R:14708110:14708343:+@chr2R:14708546:14708623:+","chr3R:4837767:4838027:-@chr3R:4838559:4838983:-","chr3R:20895649:20895772:+@chr3R:20895925:20896636:+")

# Tag every row whose intron ID is in the list. %in% marks all matching rows
# and ignores listed IDs absent from the table, whereas match() only found
# the first occurrence of each ID and produced NA indices for missing ones.
combo.juncratio.data.parsed$u12 <- "u2"
combo.juncratio.data.parsed$u12[combo.juncratio.data.parsed$intron %in% u12introns] <- "u12"

ggplot(combo.juncratio.data.parsed, aes(x=factor(u12),y=fitvalue)) + geom_boxplot(notch=TRUE) + scale_y_log10()

# Median fitvalue and intron length per class.
median(subset(combo.juncratio.data.parsed, u12=="u2")$fitvalue)
median(subset(combo.juncratio.data.parsed, u12=="u12")$fitvalue)

median(subset(combo.juncratio.data.parsed, u12=="u2")$intronlen)
median(subset(combo.juncratio.data.parsed, u12=="u12")$intronlen)


```

# Last introns
```{r}

# Flag, for every gene, the intron with the largest intronnum as "last".
combo.juncratio.data.parsed$last <- "non-last"

genes <- unique(combo.juncratio.data.parsed$gene)
# Group row indices by gene once instead of calling subset() on the whole
# table for every gene (rows with an NA gene are dropped by split(), matching
# the previous loop, where they never matched).
rows.by.gene <- split(seq_len(nrow(combo.juncratio.data.parsed)), combo.juncratio.data.parsed$gene)
# First row holding the maximum intronnum within each gene.
last.rows <- unlist(lapply(rows.by.gene, function(rows){
  rows[which.max(combo.juncratio.data.parsed$intronnum[rows])]
}), use.names = FALSE)
last.introns <- as.character(combo.juncratio.data.parsed$intron[last.rows])
last.introns <- last.introns[!is.na(last.introns)]
# As before, every row carrying one of those intron IDs is flagged.
combo.juncratio.data.parsed$last[combo.juncratio.data.parsed$intron %in% last.introns] <- "last"

ggplot(combo.juncratio.data.parsed, aes(x=factor(last),y=fitvalue)) + geom_boxplot(notch=TRUE) + scale_y_log10()
ggplot(combo.juncratio.data.parsed, aes(x=factor(last),y=fitvalue,fill=factor(IEratio_bin))) + geom_boxplot(notch=TRUE) + scale_y_log10()

ggplot(combo.juncratio.data.parsed, aes(linetype=factor(last),x=fitvalue,color=factor(IEratio_bin))) + stat_ecdf() + scale_x_log10()

# time to transcribe last exon (downstream exon length divided by 1500)
combo.juncratio.data.parsed$downexon_txntime <- combo.juncratio.data.parsed$downexon_len/1500

# Number of last introns, and how many have a downstream-exon transcription
# time smaller than their fitvalue.
dim(subset(combo.juncratio.data.parsed, last=="last"))
dim(subset(combo.juncratio.data.parsed, last=="last" & downexon_txntime < fitvalue ))

```

# Fast/Slow gene ontology
```{r}

# 10th and 90th percentiles of Splicing.rate.remaining.introns (NAs ignored).
fastest.tenth <- quantile(combo.metadata.polyA.half.median$Splicing.rate.remaining.introns,
                          probs = 0.1, na.rm = TRUE)
slowest.tenth <- quantile(combo.metadata.polyA.half.median$Splicing.rate.remaining.introns,
                          probs = 0.9, na.rm = TRUE)

# Genes at or below the 10th percentile are labelled "fastest", at or above
# the 90th "slowest", everything else (including missing values) "middle".
# NOTE(review): "fastest" = lowest values presumes the column holds a
# half-life-like quantity -- confirm.
combo.metadata.polyA.half.median$fastslow <- rep("middle", nrow(combo.metadata.polyA.half.median))
combo.metadata.polyA.half.median$fastslow[
  which(combo.metadata.polyA.half.median$Splicing.rate.remaining.introns <= fastest.tenth)] <- "fastest"
combo.metadata.polyA.half.median$fastslow[
  which(combo.metadata.polyA.half.median$Splicing.rate.remaining.introns >= slowest.tenth)] <- "slowest"

ggplot(combo.metadata.polyA.half.median, aes(x = factor(fastslow), y = TPM)) +
  geom_boxplot(notch = TRUE) +
  scale_y_log10()
ggplot(combo.metadata.polyA.half.median, aes(x = factor(fastslow), y = percentage_exondef)) +
  geom_boxplot(notch = TRUE)

# Gene ID lists (one ID per line, no header) for the two tails and the
# full background set.
write.table(as.character(combo.metadata.polyA.half.median$Gene.ID[combo.metadata.polyA.half.median$fastslow == "fastest"]),
            file = "fastest.tenth.genes", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(as.character(combo.metadata.polyA.half.median$Gene.ID[combo.metadata.polyA.half.median$fastslow == "slowest"]),
            file = "slowest.tenth.genes", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
write.table(as.character(combo.metadata.polyA.half.median$Gene.ID),
            file = "all.genes", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)


```