From f6dd3e5e164d741fb05029ee88ba4ea1fdddcdf3 Mon Sep 17 00:00:00 2001 From: Esha Choukse Date: Sat, 7 Oct 2023 03:27:02 +0000 Subject: [PATCH] perf fix --- include/mscclpp/fifo_device.hpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/mscclpp/fifo_device.hpp b/include/mscclpp/fifo_device.hpp index 67fbe9013..c48cef274 100644 --- a/include/mscclpp/fifo_device.hpp +++ b/include/mscclpp/fifo_device.hpp @@ -63,13 +63,9 @@ struct FifoDeviceHandle { ProxyTrigger* triggerPtr = (ProxyTrigger*)&(this->triggers[curFifoHead % size]); - // only the fst needs to atomically stored thanks to memory_order_release. On the host side we only need to - // atomically load the fst. If fst is non-zero, snd is guaranteed to be the right value - triggerPtr->snd = trigger.snd; - cuda::atomic_ref{triggerPtr->fst}.store(trigger.fst, - cuda::memory_order_release); + // store with memory order release so that the while loop does not go past this. + asm volatile("st.global.release.cta.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); - // asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd)); return curFifoHead; }