diff --git a/src/low_level/structures.jl b/src/low_level/structures.jl
index d896f95..e67c8a0 100644
--- a/src/low_level/structures.jl
+++ b/src/low_level/structures.jl
@@ -22,7 +22,12 @@ struct Intermediate{K}
     Py::K
     Pxy2::K
     Pl::K
+    X::K
+    Y::K
     dz::K
+    dz1::K
+    dz2::K
+    dz3::K
 end
 
 struct GPU_Drift{K}
diff --git a/src/track_a_drift_gpu_test.jl b/src/track_a_drift_gpu_test.jl
new file mode 100644
index 0000000..23dad78
--- /dev/null
+++ b/src/track_a_drift_gpu_test.jl
@@ -0,0 +1,86 @@
+using CUDA, BenchmarkTools, Adapt
+include("low_level/sqrt_one.jl")
+include("low_level/structures.jl")
+
+Adapt.@adapt_structure GPU_Particle
+Adapt.@adapt_structure GPU_Drift
+Adapt.@adapt_structure Intermediate
+    
+function track_a_drift_GPU2!(p_in, drift, inter)
+    """Tracks the incoming Particle p_in though drift element
+    and returns the outgoing particle. 
+    See Bmad manual section 24.9 
+    """
+    L = drift.L
+    s = p_in.s
+    p0c = p_in.p0c
+    mc2 = p_in.mc2
+
+    x, px, y, py, z, pz = p_in.x, p_in.px, p_in.y, p_in.py, p_in.z, p_in.pz
+    P, Px, Py, Pxy2, Pl, X, Y, dz, dz1, dz2, dz3 = inter.P, inter.Px, inter.Py, 
+    inter.Pxy2, inter.Pl, inter.X, inter.Y, inter.dz, inter.dz1, inter.dz2, inter.dz3
+    
+    @inbounds P .= pz .+ 1
+    @inbounds Px .= px ./ P
+    @inbounds Py .= py ./ P
+    @inbounds Pxy2 .= Px.^2 .+ Py.^2
+    @inbounds Pl .= sqrt.(1 .- Pxy2)
+
+    @inbounds X .= L .* Px; Y .= L .* Py
+    @inbounds X ./= Pl; Y ./= Pl
+    @inbounds x .+= X; y .+= Y
+
+    # z = z + L * ( beta/beta_ref - 1.0/Pl ) but numerically accurate:
+
+
+    @inbounds dz1 .= 2 .*pz .+ pz.^2
+    @inbounds dz1 .*= mc2.^2
+    @inbounds dz2 .= p0c.*P
+    @inbounds dz2 .^= 2
+    @inbounds dz2 .+= mc2.^2
+    @inbounds dz3 .= sqrt_one.(.-Pxy2)
+    @inbounds dz3 ./= Pl
+    @inbounds dz .= dz1 ./ dz2
+    @inbounds dz .= sqrt_one.(dz)
+    @inbounds dz .+= dz3
+    @inbounds dz .*= L
+
+    @inbounds z .+= dz
+    @inbounds s .+= L
+    
+    
+
+    return
+end
+
+"""helper arrays to determine array memory allocation"""
+P = CUDA.fill(0.0, 1000);
+Px = CUDA.fill(0.0, 1000);
+Py = CUDA.fill(0.0, 1000);
+Pxy2 = CUDA.fill(0.0, 1000);
+Pl = CUDA.fill(0.0, 1000);
+dz = CUDA.fill(0.0, 1000);
+dz1 = CUDA.fill(0.0, 1000);
+dz2 = CUDA.fill(0.0, 1000);
+dz3 = CUDA.fill(0.0, 1000);
+X = CUDA.fill(0.0, 1000);
+Y = CUDA.fill(0.0, 1000);
+
+"""sample input of 1000 particles"""
+x = CUDA.fill(1.0, 1000);
+y = CUDA.fill(1.0, 1000);
+z = CUDA.fill(0.75, 1000);
+px = CUDA.fill(0.5, 1000);
+py = CUDA.fill(0.2, 1000);
+pz = CUDA.fill(0.5, 1000);
+s = CUDA.fill(1.0, 1000);
+p0c = CUDA.fill(1.0, 1000);
+mc2 = CUDA.fill(.511, 1000);
+L = CUDA.fill(0.005, 1000);
+
+
+drift = GPU_Drift(L)
+inter = Intermediate(P, Px, Py, Pxy2, Pl, X, Y, dz, dz1, dz2, dz3)
+p_in = GPU_Particle(x, px, y, py, z, pz, s, p0c, mc2)
+
+@cuda track_a_drift_GPU2!(p_in, drift, inter)