-
Notifications
You must be signed in to change notification settings - Fork 45
Ramdom Regression Marker Effect Model
paper:
- Qu, Jiayi, Gota Morota, and Hao Cheng. "A Bayesian random regression method using mixture priors for genome‐enabled analysis of time‐series high‐throughput phenotyping data." The Plant Genome 15, no. 3 (2022): e20228; https://doi.org/10.1002/tpg2.20228.
# Load packages
using JWAS, DataFrames, CSV, DelimitedFiles, LinearAlgebra, Statistics, Random;
using GSL,Kronecker;
using Plots;
using HTTP;
# read data from github
function dataset_big(file_name::AbstractString; dataset_url::AbstractString="https://raw.githubusercontent.com/zhaotianjing/bio_protocol/refs/heads/main/data/")
rdaname = joinpath(dataset_url, string(file_name))
http_obj = HTTP.get(rdaname)
df = CSV.File(IOBuffer(http_obj.body)) |> DataFrame
#df = CSV.File(IOBuffer(http_obj.body); delim = ',', missingstring="NA", types=Dict(:ID => String)) |> DataFrame
return df
end
phenotypes = dataset_big("RR_PSA_rep1.csv")
genotypes = dataset_big("Genotype2_first5k.csv")
# simulated true QTL
trueQTLs = dataset_big("QTLpos_rep1.csv")
In this code, we assume that Legendre polynomials are used to model the standardized time points. The order of the polynomials is determined by the parameter ncoef
, which specifies the number of polynomial coefficients to be estimated. The function generatefullPhi(timevec, ncoeff)
constructs the Legendre polynomial matrix (Φ) for a given set of time points. Each row of the matrix represents a time point, while each column corresponds to a Legendre polynomial of increasing order.
ncoef = 2
### Generate Lengendre Polynomial Covariates
function generatefullPhi(timevec, ncoeff=3)
# timevec: a vector of time points for all observations
# ncoeff : the number of polynomial coefficients to be estimated
times = sort(unique(timevec)) # vector of whole timepoints sorted
tmin = minimum(times)
tmax = maximum(times)
# standardized time points
qi = 2 * (times .- tmin) ./ (tmax .- tmin) .- 1
Φ = Matrix{Float64}(undef, length(times), ncoeff) # Phi of size ntimes x ncoeff
for i = 1:ncoeff
n = i - 1
## Given the normalized function from L. R. Schaeffer
Φ[:, i] = sqrt((2 * n + 1) / 2) * sf_legendre_Pl.(n, qi) # construct the matrix
end
return Φ
end
# the Ledengre polynomial matrix
myPhi = generatefullPhi(phenotypes[!, :time], ncoef)
This part of the code splits the dataset into training and testing subsets for genomic prediction.
# number of testing individual = 119 (1/3)
IDs = unique(phenotypes[!, :ID])
nind = size(IDs, 1)
nt = 119
testingInd = shuffle(IDs)[1:nt]
trainingInd = filter!(x -> x ∉ testingInd, IDs);
pheno_training = filter(row -> row.ID in trainingInd, copy(phenotypes))
pheno_testing = filter(row -> row.ID in testingInd, copy(phenotypes));
# Add Legendre Polynomial Covariates to Training Data
wholePhi = myPhi[pheno_training[!, :time], :]
for i in 1:ncoef
pheno_training[!, Symbol("Phi$i")] = wholePhi[:, i]
end
# Add Day$i columns to pheno_training to account for heterogeneous residual variance
tp = length(unique(pheno_training[!, :time]))
for i in 1:tp
colname = Symbol("Day$i")
colvalue = Vector{Union{Missing,Int}}(missing, size(pheno_training, 1)) # Allows missing + int
index = findall(pheno_training[!, :time] .== i)
colvalue[index] .= 1:length(index) # Assign sequential values
pheno_training[!, colname] = colvalue
end
# Add a column to pheno_training to represent permanent environmental effects
pheno_training[!, :pe] = pheno_training[!, :ID]
# get_genotypes
geno = get_genotypes(genotypes, header=true, separator=',',method="BayesC")
#model equation
fixed_effect = join(" + Phi" .* string.(collect(2:ncoef)))
# add permanent environmental effects
pe = join(" + Phi" .* string.(collect(1:ncoef)) .* "*pe")
#add heterogeneous variance
hetero_residual = join(" + Day" .* string.(collect(1:tp)))
model_equation = "PSA = Phi1" * fixed_effect * pe * hetero_residual * "+ geno"
model = build_model(model_equation);
#Set Model Parameters
# set covariates
set_covariate(model, "Phi1")
set_covariate(model, "Phi2");
# set random effects for pe
set_random(model, "Phi1*pe")
set_random(model, "Phi2*pe")
# set random effects for heterogeneous residual variance
for d in 1:tp
set_random(model, "Day$d")
end
# run MCMC sampling
@time output = runMCMC(model, pheno_training, burnin=10,chain_length=100, RRM=myPhi, seed=123);
This section of the code estimates time-specific breeding values (BVs) for individuals by using estimated breeding values (EBVs) of random regression coefficients. It also evaluates prediction accuracy for testing individuals across different time points.
#### Validation
# estimated EBVs (coefficients) for individuals
u1 = CSV.read("results/EBV_1.txt", DataFrame)
u2 = CSV.read("results/EBV_2.txt", DataFrame)
EBVs = DataFrame(ID=u1[!, :ID],u1=u1[!, :EBV],u2=u2[!, :EBV])
# Compute gBLUP for Each Time Point
# Calculate the BV for individuals by time
n = size(EBVs, 1)
Idt = Matrix{Float64}(I, n, n)
Phi = generatefullPhi(collect(1:tp), ncoef)
Z = Idt ⊗ Phi
us = [[] for i = 1:n]
for i = 1:n
us[i] = Matrix(EBVs)[i, 2:end]
end
uhat = vcat(us...)
gBLUP = Z * uhat
gBLUP = convert(Array{Float64,1}, gBLUP)
#Organize time-specific EBVs by time
correctEBVs = DataFrame(
ID=repeat(EBVs[!, :ID], inner=tp),
time=repeat(collect(1:tp), outer=n),
EBV=gBLUP)
# choose EBV for testing Individuals
testingEBVs = filter(row -> row.ID in testingInd, correctEBVs)
# calculate prediction accuracy across time for testing individuals
accuracy = []
for timei = 1:tp
println("Day$timei.")
pheno_testingi = filter(row -> row.time in [timei], pheno_testing)
select!(pheno_testingi, Not(:time))
EBV_testingi = filter(row -> row.time in [timei], testingEBVs)
select!(EBV_testingi, Not(:time))
final_dfi = innerjoin(pheno_testingi, EBV_testingi, on=:ID)
res = cor(final_dfi[!, :EBV], final_dfi[!, :PSA])
push!(accuracy, res)
end
This section of the code estimates the posterior inclusion probability (PIP) for single nucleotide polymorphisms (SNPs) that influence a specific random regression coefficient in a Bayesian regression model.
# calculate the model frequency for coef i (1: intercept, 2: slope)
coef = 2
println("Coef $coef")
mrk_eff_sample_c = "results/MCMC_samples_marker_effects_geno_$coef.txt"
mrk_samples, markerID = readdlm(mrk_eff_sample_c, header=true, ',')
markerID = vec(strip.(markerID, ['\"']))
model_freq = vec(mean(mrk_samples .!= 0, dims=1))
# markerOrder is used to order SNPs in the plots
RR_MFc = DataFrame(markerID=markerID, modelfrequency=model_freq)
Joint Analysis of Continuous, Censored and Categorical Traits
Integrating Phenotypic Causal Networks in GWAS
single trait and multiple trait GBLUP by providing the relationship matrix directly
User-defined Prediction Equation
Description of Mixed Effects Model
Constraint on variance components