Merge pull request #57 from gpudirect/volta_features
enable Volta features when available
e-ago authored Apr 26, 2018
2 parents 8907454 + 9308b09 commit c7aace1
Showing 3 changed files with 99 additions and 41 deletions.
132 changes: 92 additions & 40 deletions src/gdsync.cpp
@@ -121,8 +121,6 @@ const size_t GDS_GPU_MAX_INLINE_SIZE = 256;
//bool gds_has_weak_consistency = GDS_HAS_WEAK_API;
//bool gds_has_membar = GDS_HAS_MEMBAR;

static bool gpu_does_support_nor(gds_peer *peer) { return false; }

//-----------------------------------------------------------------------------

// BUG: this feature is GPU device dependent
@@ -427,6 +425,58 @@ int gds_fill_poke(gds_op_list_t &ops, uint32_t *ptr, uint32_t value, int flags)
}

//-----------------------------------------------------------------------------

static int gds_fill_poke64(gds_op_list_t &ops, CUdeviceptr addr, uint64_t value, int flags)
{
int retcode = 0;
#if CUDA_VERSION >= 9000
CUdeviceptr dev_ptr = addr;

// TODO: convert into errors
assert(addr);
assert((((unsigned long)addr) & 0x7) == 0);

bool need_barrier = (flags & GDS_WRITE_PRE_BARRIER) ? true : false;

CUstreamBatchMemOpParams param;
param.operation = CU_STREAM_MEM_OP_WRITE_VALUE_64;
param.writeValue.address = dev_ptr;
param.writeValue.value64 = value;
param.writeValue.flags = CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
if (need_barrier)
param.writeValue.flags = 0;
gds_dbg("op=%d addr=%p value=%08x flags=%08x\n",
param.operation,
(void*)param.writeValue.address,
param.writeValue.value,
param.writeValue.flags);
ops.push_back(param);
#else
retcode = EINVAL;
#endif
return retcode;
}
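
For reference, a minimal usage sketch of this helper (illustrative only: "stream" and "dev_addr" are hypothetical, and gds_stream_batch_ops(), declared in utils.hpp, is assumed to submit the accumulated batch to the stream):

    gds_op_list_t ops;
    int ret = gds_fill_poke64(ops, dev_addr, 0x1ULL, 0);
    if (!ret)
        ret = gds_stream_batch_ops(stream, ops, 0);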

int gds_fill_poke64(gds_op_list_t &ops, uint64_t *ptr, uint64_t value, int flags)
{
int retcode = 0;
CUdeviceptr dev_ptr = 0;

gds_dbg("addr=%p value=%016lx flags=%08x\n", ptr, value, flags);

retcode = gds_map_mem(ptr, sizeof(*ptr), memtype_from_flags(flags), &dev_ptr);
if (retcode) {
gds_err("error %d while looking up %p\n", retcode, ptr);
goto out;
}

retcode = gds_fill_poke64(ops, dev_ptr, value, flags);
out:
return retcode;
}

//-----------------------------------------------------------------------------

static int gds_fill_poll(gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags)
{
int retcode = 0;
@@ -460,6 +510,16 @@ static int gds_fill_poll(gds_op_list_t &ops, CUdeviceptr ptr, uint32_t magic, int cond_flag, int flags)
param.waitValue.flags = CU_STREAM_WAIT_VALUE_AND;
cond_str = "CU_STREAM_WAIT_VALUE_AND";
break;

case GDS_WAIT_COND_NOR:
#if CUDA_VERSION >= 9000
param.waitValue.flags = CU_STREAM_WAIT_VALUE_NOR;
#else
gds_err("GDS_WAIT_COND_NOR requires CUDA 9.0 at least\n");
retcode = EINVAL;
#endif
cond_str = "CU_STREAM_WAIT_VALUE_NOR";
break;
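// NOR semantics (per the CUDA 9 driver API): the wait completes
// when ~(*addr | magic) != 0, i.e. when at least one bit is clear
// in both the polled word and the magic value.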
default:
gds_err("invalid wait condition flag\n");
retcode = EINVAL;
@@ -709,9 +769,7 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis
retcode = gds_fill_inlcpy(ops, dev_ptr, &data, sizeof(data), flags);
}
else if (peer->has_write64) {
gds_err("write64 is not supported\n");
retcode = EINVAL;
break;
retcode = gds_fill_poke64(ops, dev_ptr, data, flags);
}
else {
uint32_t datalo = gds_qword_lo(op->wr.qword_va.data);
@@ -771,11 +829,11 @@ int gds_post_ops(gds_peer *peer, size_t n_ops, struct peer_op_wr *op, gds_op_lis

switch(op->type) {
case IBV_EXP_PEER_OP_POLL_NOR_DWORD:
//poll_cond = GDS_WAIT_COND_NOR;
// TODO: lookup and pass peer down
assert(gpu_does_support_nor(NULL));
retcode = EINVAL;
goto out;
if (!peer->has_wait_nor) {
gds_err("IBV_EXP_PEER_OP_POLL_NOR_DWORD should have not been enabled\n");
retcode = EINVAL;
}
poll_cond = GDS_WAIT_COND_NOR;
break;
case IBV_EXP_PEER_OP_POLL_GEQ_DWORD:
poll_cond = GDS_WAIT_COND_GEQ;
@@ -1243,7 +1301,7 @@ static bool support_memops(CUdevice dev)
#else
#error "GCC error CUDA MemOp APIs is missing prior to CUDA 8.0"
#endif
gds_warn("dev=%d has_memops=%d\n", dev, flag);
gds_dbg("dev=%d has_memops=%d\n", dev, flag);
return !!flag;
}

@@ -1255,6 +1313,7 @@ static bool support_remote_flush(CUdevice dev)
#else
#warning "Assuming CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES=0 prior to CUDA 9.2"
#endif
gds_dbg("dev=%d has_remote_flush=%d\n", dev, flag);
return !!flag;
}

@@ -1264,6 +1323,7 @@ static bool support_write64(CUdevice dev)
#if CUDA_VERSION >= 9000
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, dev));
#endif
gds_dbg("dev=%d has_write64=%d\n", dev, flag);
return !!flag;
}

@@ -1273,6 +1333,7 @@ static bool support_wait_nor(CUdevice dev)
#if CUDA_VERSION >= 9000
CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR, dev));
#endif
gds_dbg("dev=%d has_wait_nor=%d\n", dev, flag);
return !!flag;
}
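
These probes map directly onto CUDA driver attributes. A self-contained sketch (a hypothetical standalone tool, not part of this library) that prints the two Volta-era capabilities for device 0:

    #include <cuda.h>
    #include <cstdio>

    int main()
    {
        CUdevice dev;
        int w64 = 0, nor = 0;
        cuInit(0);
        cuDeviceGet(&dev, 0);
    #if CUDA_VERSION >= 9000
        // both attributes were introduced with CUDA 9.0
        cuDeviceGetAttribute(&w64, CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS, dev);
        cuDeviceGetAttribute(&nor, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR, dev);
    #endif
        printf("write64=%d wait_nor=%d\n", w64, nor);
        return 0;
    }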

@@ -1311,45 +1372,38 @@ static void gds_init_peer(gds_peer *peer, int gpu_id)
peer->has_wait_nor = support_wait_nor(dev);
peer->has_inlcpy = support_inlcpy(dev) && gds_enable_inlcpy();
peer->has_membar = support_membar(dev);
}

//-----------------------------------------------------------------------------

static void gds_init_peer_attr(gds_peer_attr *attr, gds_peer *peer)
{
assert(peer);

peer->alloc_type = gds_peer::NONE;
peer->alloc_flags = 0;

attr->peer_id = peer_to_id(peer);
attr->buf_alloc = gds_buf_alloc;
attr->buf_release = gds_buf_release;
attr->register_va = gds_register_va;
attr->unregister_va = gds_unregister_va;
peer->attr.peer_id = peer_to_id(peer);
peer->attr.buf_alloc = gds_buf_alloc;
peer->attr.buf_release = gds_buf_release;
peer->attr.register_va = gds_register_va;
peer->attr.unregister_va = gds_unregister_va;

attr->caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP |
IBV_EXP_PEER_OP_STORE_QWORD_CAP |
IBV_EXP_PEER_OP_FENCE_CAP |
IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP );
peer->attr.caps = ( IBV_EXP_PEER_OP_STORE_DWORD_CAP |
IBV_EXP_PEER_OP_STORE_QWORD_CAP |
IBV_EXP_PEER_OP_FENCE_CAP |
IBV_EXP_PEER_OP_POLL_AND_DWORD_CAP );

if (gpu_does_support_nor(peer))
attr->caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP;
if (peer->has_wait_nor)
peer->attr.caps |= IBV_EXP_PEER_OP_POLL_NOR_DWORD_CAP;
else
attr->caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP;
peer->attr.caps |= IBV_EXP_PEER_OP_POLL_GEQ_DWORD_CAP;

if (peer->has_inlcpy) {
attr->caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP;
peer->attr.caps |= IBV_EXP_PEER_OP_COPY_BLOCK_CAP;
}
else if (peer->has_write64 || gds_simulate_write64()) {
attr->caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP;
peer->attr.caps |= IBV_EXP_PEER_OP_STORE_QWORD_CAP;
}
gds_dbg("caps=%016lx\n", attr->caps);
attr->peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE;
attr->comp_mask = IBV_EXP_PEER_DIRECT_VERSION;
attr->version = 1;
gds_dbg("caps=%016lx\n", peer->attr.caps);
peer->attr.peer_dma_op_map_len = GDS_GPU_MAX_INLINE_SIZE;
peer->attr.comp_mask = IBV_EXP_PEER_DIRECT_VERSION;
peer->attr.version = 1;

gds_dbg("peer_attr: peer_id=%"PRIx64"\n", attr->peer_id);
gds_dbg("peer_attr: peer_id=%"PRIx64"\n", peer->attr.peer_id);
}

//-----------------------------------------------------------------------------
@@ -1459,21 +1513,19 @@ int gds_register_peer_ex(struct ibv_context *context, unsigned gpu_id, gds_peer
}

gds_peer *peer = &gpu_peer[gpu_id];
gds_peer_attr *peer_attr = &gpu_peer_attr[gpu_id];

if (gpu_registered[gpu_id]) {
gds_dbg("gds_peer for GPU%u already initialized\n", gpu_id);
} else {
gds_init_peer(peer, gpu_id);
gds_init_peer_attr(peer_attr, peer);
gpu_registered[gpu_id] = true;
}

if (p_peer)
*p_peer = peer;

if (p_peer_attr)
*p_peer_attr = peer_attr;
*p_peer_attr = &peer->attr;

return ret;
}
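
A caller-side sketch of the updated contract (variable names are illustrative): the pointer returned through p_peer_attr now aliases the peer's embedded attr, so it remains valid for as long as the peer object itself.

    gds_peer *peer = NULL;
    gds_peer_attr *peer_attr = NULL;
    if (!gds_register_peer_ex(context, gpu_id, &peer, &peer_attr))
        assert(peer_attr == &peer->attr);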
7 changes: 6 additions & 1 deletion src/objs.hpp
@@ -75,9 +75,14 @@ struct gds_peer {
bool has_wait_nor;
bool has_inlcpy;
bool has_membar;
gds_peer_attr attr;

// before calling ibv_exp_create_cq(), patch flags with appropriate values
enum obj_type { NONE, CQ, WQ, N_IBV_OBJS } alloc_type;
// This field works as an ugly run-time parameter-passing
// mechanism: it carries tracking info during the QP creation
// phase, so no more than one outstanding call per peer is
// supported. In practice, before calling ibv_exp_create_cq(),
// we patch this field with the appropriate value.
int alloc_flags; // out of gds_flags_t

// register peer memory
1 change: 1 addition & 0 deletions src/utils.hpp
@@ -176,6 +176,7 @@ void gds_dump_params(gds_op_list_t &params);
int gds_fill_membar(gds_op_list_t &param, int flags);
int gds_fill_inlcpy(gds_op_list_t &param, void *ptr, void *data, size_t n_bytes, int flags);
int gds_fill_poke(gds_op_list_t &param, uint32_t *ptr, uint32_t value, int flags);
int gds_fill_poke64(gds_op_list_t &param, uint64_t *ptr, uint64_t value, int flags);
int gds_fill_poll(gds_op_list_t &param, uint32_t *ptr, uint32_t magic, int cond_flag, int flags);
int gds_stream_batch_ops(CUstream stream, gds_op_list_t &params, int flags);

