Skip to content

Ramdom Regression Marker Effect Model

Jiayi Qu edited this page Feb 1, 2025 · 17 revisions

Example of Random regression model with mixture prior

paper:

  • Qu, Jiayi, Gota Morota, and Hao Cheng. "A Bayesian random regression method using mixture priors for genome‐enabled analysis of time‐series high‐throughput phenotyping data." The Plant Genome 15, no. 3 (2022): e20228; https://doi.org/10.1002/tpg2.20228.
# Load packages
using JWAS, DataFrames, CSV, DelimitedFiles, LinearAlgebra, Statistics, Random;
using GSL,Kronecker;
using Plots;
using HTTP;

# read data from github
function dataset_big(file_name::AbstractString; dataset_url::AbstractString="https://raw.githubusercontent.com/zhaotianjing/bio_protocol/refs/heads/main/data/")

    rdaname = joinpath(dataset_url, string(file_name))
    http_obj = HTTP.get(rdaname)
    df = CSV.File(IOBuffer(http_obj.body)) |> DataFrame
    #df = CSV.File(IOBuffer(http_obj.body); delim = ',', missingstring="NA", types=Dict(:ID => String)) |> DataFrame 
    return df
end

phenotypes = dataset_big("RR_PSA_rep1.csv")
genotypes = dataset_big("Genotype2_first5k.csv")
# simulated true QTL
trueQTLs = dataset_big("QTLpos_rep1.csv")

In this code, we assume that Legendre polynomials are used to model the standardized time points. The order of the polynomials is determined by the parameter ncoef, which specifies the number of polynomial coefficients to be estimated. The function generatefullPhi(timevec, ncoeff) constructs the Legendre polynomial matrix (Φ) for a given set of time points. Each row of the matrix represents a time point, while each column corresponds to a Legendre polynomial of increasing order.

ncoef = 2
### Generate Lengendre Polynomial Covariates
function generatefullPhi(timevec, ncoeff=3)
    # timevec: a vector of time points for all observations
    # ncoeff : the number of polynomial coefficients to be estimated
    times = sort(unique(timevec)) # vector of whole timepoints sorted
    tmin = minimum(times)
    tmax = maximum(times)
    # standardized time points
    qi = 2 * (times .- tmin) ./ (tmax .- tmin) .- 1
    Φ = Matrix{Float64}(undef, length(times), ncoeff) # Phi of size ntimes x ncoeff
    for i = 1:ncoeff
        n = i - 1
        ## Given the normalized function from L. R. Schaeffer
        Φ[:, i] = sqrt((2 * n + 1) / 2) * sf_legendre_Pl.(n, qi) # construct the matrix
    end
    return Φ
end
# the Ledengre polynomial matrix 
myPhi = generatefullPhi(phenotypes[!, :time], ncoef)

This part of the code splits the dataset into training and testing subsets for genomic prediction.

# number of testing individual = 119 (1/3)
IDs = unique(phenotypes[!, :ID])
nind = size(IDs, 1)
nt = 119
testingInd = shuffle(IDs)[1:nt]
trainingInd = filter!(x -> x  testingInd, IDs);
pheno_training = filter(row -> row.ID in trainingInd, copy(phenotypes))
pheno_testing = filter(row -> row.ID in testingInd, copy(phenotypes));

# Add Legendre Polynomial Covariates to Training Data
wholePhi = myPhi[pheno_training[!, :time], :]

for i in 1:ncoef
    pheno_training[!, Symbol("Phi$i")] = wholePhi[:, i]
end

# Add Day$i columns to pheno_training to account for heterogeneous residual variance
tp = length(unique(pheno_training[!, :time]))

for i in 1:tp
    colname = Symbol("Day$i")
    colvalue = Vector{Union{Missing,Int}}(missing, size(pheno_training, 1))  # Allows missing + int
    index = findall(pheno_training[!, :time] .== i)
    colvalue[index] .= 1:length(index)  # Assign sequential values
    pheno_training[!, colname] = colvalue
end

# Add a column to pheno_training to represent permanent environmental effects
pheno_training[!, :pe] = pheno_training[!, :ID]

Fit the RR-BayesC model

# get_genotypes
geno = get_genotypes(genotypes, header=true, separator=',',method="BayesC")

#model equation 
fixed_effect = join(" + Phi" .* string.(collect(2:ncoef)))

# add permanent environmental effects 
pe = join(" + Phi" .* string.(collect(1:ncoef)) .* "*pe")

#add heterogeneous variance 
hetero_residual = join(" + Day" .* string.(collect(1:tp)))

model_equation = "PSA = Phi1" * fixed_effect * pe * hetero_residual * "+ geno"

model = build_model(model_equation);

#Set Model Parameters

# set covariates
set_covariate(model, "Phi1")
set_covariate(model, "Phi2");

# set random effects for pe
set_random(model, "Phi1*pe")
set_random(model, "Phi2*pe")

# set random effects for heterogeneous residual variance
for d in 1:tp
    set_random(model, "Day$d")
end

# run MCMC sampling 
@time output = runMCMC(model, pheno_training, burnin=10,chain_length=100, RRM=myPhi, seed=123);

This section of the code estimates time-specific breeding values (BVs) for individuals by using estimated breeding values (EBVs) of random regression coefficients. It also evaluates prediction accuracy for testing individuals across different time points.

#### Validation 
# estimated EBVs (coefficients) for individuals
u1 = CSV.read("results/EBV_1.txt", DataFrame)
u2 = CSV.read("results/EBV_2.txt", DataFrame)
EBVs = DataFrame(ID=u1[!, :ID],u1=u1[!, :EBV],u2=u2[!, :EBV])

# Compute gBLUP for Each Time Point
# Calculate the BV for individuals by time 
n = size(EBVs, 1)
Idt = Matrix{Float64}(I, n, n)
Phi = generatefullPhi(collect(1:tp), ncoef)
Z = Idt  Phi
us = [[] for i = 1:n]
for i = 1:n
    us[i] = Matrix(EBVs)[i, 2:end]
end
uhat = vcat(us...)
gBLUP = Z * uhat
gBLUP = convert(Array{Float64,1}, gBLUP)

#Organize time-specific EBVs by time
correctEBVs = DataFrame(
    ID=repeat(EBVs[!, :ID], inner=tp),
    time=repeat(collect(1:tp), outer=n),
    EBV=gBLUP)

# choose EBV for testing Individuals
testingEBVs = filter(row -> row.ID in testingInd, correctEBVs)

# calculate prediction accuracy across time for testing individuals
accuracy = []
for timei = 1:tp
    println("Day$timei.")
    pheno_testingi = filter(row -> row.time in [timei], pheno_testing)
    select!(pheno_testingi, Not(:time))
    EBV_testingi = filter(row -> row.time in [timei], testingEBVs)
    select!(EBV_testingi, Not(:time))
    final_dfi = innerjoin(pheno_testingi, EBV_testingi, on=:ID)
    res = cor(final_dfi[!, :EBV], final_dfi[!, :PSA])
    push!(accuracy, res)
end

This section of the code estimates the posterior inclusion probability (PIP) for single nucleotide polymorphisms (SNPs) that influence a specific random regression coefficient in a Bayesian regression model.

# calculate the model frequency for coef i (1: intercept, 2: slope)
coef = 2
println("Coef $coef")
mrk_eff_sample_c = "results/MCMC_samples_marker_effects_geno_$coef.txt"
mrk_samples, markerID = readdlm(mrk_eff_sample_c, header=true, ',')
markerID = vec(strip.(markerID, ['\"']))
model_freq = vec(mean(mrk_samples .!= 0, dims=1))
# markerOrder is used to order SNPs in the plots
RR_MFc = DataFrame(markerID=markerID, modelfrequency=model_freq)
Clone this wiki locally