From 6531e7f244a86025bb22d093628c8ad3aa324305 Mon Sep 17 00:00:00 2001 From: Gavin Hunsche Date: Mon, 24 Jun 2024 09:50:38 -0400 Subject: [PATCH] save 1 6/24/24 --- src/track_a_drift_cpu.jl | 7 +++-- src/track_a_drift_gpu.jl | 67 ++++++++++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/track_a_drift_cpu.jl b/src/track_a_drift_cpu.jl index 58242ed..3becbdc 100644 --- a/src/track_a_drift_cpu.jl +++ b/src/track_a_drift_cpu.jl @@ -1,4 +1,4 @@ -using CUDA, BenchmarkTools, Test, Adapt +using CUDA, BenchmarkTools include("low_level/sqrt_one.jl") include("low_level/structures.jl") @@ -37,7 +37,7 @@ function track_a_drift(p_in, drift) end - +"""test input""" x = fill(1.0, 1024) y = fill(1.0, 1024) z = fill(0.75, 1024) @@ -54,4 +54,5 @@ drift = Drift(L) p_in = Particle(x, px, y, py, z, pz, s, p0c, mc2) -track_a_drift(p_in, drift) +track_a_drift(p_in, drift) # consistent output with python code +@btime track_a_drift(p_in, drift) # first test: 97.650 μs; second test: 100.063 μs \ No newline at end of file diff --git a/src/track_a_drift_gpu.jl b/src/track_a_drift_gpu.jl index 58242ed..74e8052 100644 --- a/src/track_a_drift_gpu.jl +++ b/src/track_a_drift_gpu.jl @@ -3,10 +3,16 @@ include("low_level/sqrt_one.jl") include("low_level/structures.jl") + +"""Adapting structs to bitstype""" +Adapt.@adapt_structure GPU_Particle +Adapt.@adapt_structure GPU_Drift +Adapt.@adapt_structure Intermediate -function track_a_drift(p_in, drift) - """Tracks the incoming Particle p_in though drift element - and returns the outgoing particle. + +function track_a_drift_GPU!(p_in, drift, inter) + """Parallelized tracking of incoming particles p_in + through drift elements, computed on the GPU. See Bmad manual section 24.9 """ L = drift.L @@ -15,6 +21,7 @@ function track_a_drift(p_in, drift) mc2 = p_in.mc2 x, px, y, py, z, pz = p_in.x, p_in.px, p_in.y, p_in.py, p_in.z, p_in.pz + P, Px, Py, Pxy2, Pl, dz = inter.P, inter.Px, inter.Py, inter.Pxy2, inter.Pl P = 1 .+ pz # Particle's total momentum over p0 Px = px ./ P # Particle's 'x' momentum over p0 @@ -23,35 +30,55 @@ function track_a_drift(p_in, drift) Pl = sqrt.(1 .- Pxy2) # Particle's longitudinal momentum over p0 x .+= L .* Px ./ Pl y .+= L .* Py ./ Pl - + # z = z + L * ( beta/beta_ref - 1.0/Pl ) but numerically accurate: dz = L .* (sqrt_one.((mc2.^2 .* (2 .*pz.+pz.^2))./((p0c.*P).^2 .+ mc2.^2)) .+ sqrt_one.(-Pxy2)./Pl) - z .+= dz s .+= L particle = p_in - - return particle + + return end + + +"""helper arrays to determine array memory allocation""" +P = CUDA.fill(undef, 1000); +Px = CUDA.fill(undef, 1000); +Py = CUDA.fill(undef, 1000); +Pxy2 = CUDA.fill(undef, 1000); +Pl = CUDA.fill(undef, 1000); +dz = CUDA.fill(undef, 1000); +"""sample input of 1000 particles""" +x = CUDA.fill(1.0, 1000); +y = CUDA.fill(1.0, 1000); +z = CUDA.fill(0.75, 1000); +px = CUDA.fill(0.5, 1000); +py = CUDA.fill(0.2, 1000); +pz = CUDA.fill(0.5, 1000); +s = CUDA.fill(1.0, 1000); +p0c = CUDA.fill(1.0, 1000); +mc2 = CUDA.fill(.511, 1000); -x = fill(1.0, 1024) -y = fill(1.0, 1024) -z = fill(0.75, 1024) -px = fill(0.5, 1024) -py = fill(0.2, 1024) -pz = fill(0.5, 1024) -s = fill(1.0, 1024) -p0c = fill(1.0, 1024) -mc2 = fill(.511, 1024) +L = CUDA.fill(0.005, 1000); -L = fill(0.005, 1024) +drift = GPU_Drift(L) +inter = Intermediate(P, Px, Py, Pxy2, Pl, dz) +p_in = GPU_Particle(x, px, y, py, z, pz, s, p0c, mc2) -drift = Drift(L) +"""benchmarking""" +function bench_track_a_drift(p_in, drift, inter) + kernel = @cuda launch=false track_a_drift_GPU!(p_in, drift, inter) + config = launch_configuration(kernel.fun) + threads = config.threads + blocks = config.blocks -p_in = Particle(x, px, y, py, z, pz, s, p0c, mc2) + CUDA.@sync begin + kernel(p_in, drift, inter; threads, blocks) + end +end -track_a_drift(p_in, drift) +@btime bench_track_a_drift($p_in, $drift, $inter)