diff --git a/CMakeLists.txt b/CMakeLists.txt index ba5ed98..18f52dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") #================================================================================================== option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF) option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF) +option(ENABLE_IBV_DIRECT "Link libibverbs symbols directly (OFF: resolve via dlsym)" ON) option(ENABLE_MPI_COMM "Enable MPI Communicator support" OFF) option(ENABLE_DMA_BUF "Enable DMA-BUF support for GPU Direct RDMA" OFF) option(ENABLE_AMD_SMI "Enable AMD-SMI pod membership queries" OFF) @@ -146,6 +147,11 @@ else() set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}") set(IBVERBS_FOUND 1) message(STATUS "- Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") + if(ENABLE_IBV_DIRECT) + message(STATUS "- IBV_DIRECT enabled (direct libibverbs linkage); set -DENABLE_IBV_DIRECT=OFF for dlsym path") + else() + message(STATUS "- IBV_DIRECT disabled: libibverbs symbols resolved via dlsym at runtime") + endif() else() if(NOT IBVERBS_LIBRARY) message(WARNING "- IBVerbs library not found") @@ -318,6 +324,9 @@ if(IBVERBS_FOUND) target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR}) target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY}) target_compile_definitions(TransferBench PRIVATE NIC_EXEC_ENABLED) + if(ENABLE_IBV_DIRECT) + target_compile_definitions(TransferBench PRIVATE IBV_DIRECT=1) + endif() endif() if(MPI_COMM_FOUND) if(TARGET MPI::MPI_CXX) diff --git a/Makefile b/Makefile index 993eedd..c60e3b8 100644 --- a/Makefile +++ b/Makefile @@ -9,10 +9,13 @@ MPI_PATH ?= /usr/local/openmpi # Optional features (set to 0 to disable, 1 to enable) # DISABLE_NIC_EXEC: Disable RDMA/NIC executor support (default: 0) +# DISABLE_IBV_DIRECT: When NIC support is on, use dlsym for libibverbs instead of direct linkage (default: 0) # DISABLE_MPI_COMM: Disable MPI communicator support (default: 0) # DISABLE_DMA_BUF: Disable DMA-BUF support for GPU Direct RDMA (default: 1) # DISABLE_AMD_SMI: Disable AMD-SMI pod membership checking support (default: 0) # DISABLE_NVML: Disable NVML pod membership detection for CUDA builds (default: 0) +# DISABLE_POD_COMM: Disable pod communication support (default: 0) +# DISABLE_CUMEM: Disable CUDA driver API (default: 0). On CUDA, POD_COMM_ENABLED requires CUMEM_ENABLED. HIPCC ?= $(ROCM_PATH)/bin/amdclang++ NVCC ?= $(CUDA_PATH)/bin/nvcc @@ -85,7 +88,9 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) # 1) DISABLE_NIC_EXEC is not set to 1 # 2) IBVerbs is found in the Dynamic Linker cache # 3) infiniband/verbs.h is found in the default include path + # When enabled, -DIBV_DIRECT=1 is added unless DISABLE_IBV_DIRECT=1 (verbs via direct link + constexpr pfn_*) DISABLE_NIC_EXEC ?= 0 + DISABLE_IBV_DIRECT ?= 0 ifneq ($(DISABLE_NIC_EXEC),1) $(info Attempting to build with NIC executor support) ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0") @@ -96,6 +101,9 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) COMMON_FLAGS += -DNIC_EXEC_ENABLED LDFLAGS += -libverbs NIC_ENABLED = 1 + ifneq ($(DISABLE_IBV_DIRECT),1) + COMMON_FLAGS += -DIBV_DIRECT=1 + endif # Disable DMA-BUF support by default (set DISABLE_DMA_BUF=0 to enable) DISABLE_DMA_BUF ?= 1 @@ -123,6 +131,9 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) $(info - To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed) else $(info - Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable) + ifeq ($(DISABLE_IBV_DIRECT),1) + $(info - IBV_DIRECT disabled: libibverbs via dlsym, DISABLE_IBV_DIRECT=1) + endif endif endif @@ -218,6 +229,18 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) endif endif + # TransferBenchCuda: CUDA driver API (libcuda). Independent of POD, but POD on CUDA requires CUMEM. + DISABLE_CUMEM ?= 0 + ifeq ($(MAKECMDGOALS),TransferBenchCuda) + ifneq ($(DISABLE_CUMEM),1) + $(info - Building with CUMEM_ENABLED (CUDA driver API, -lcuda)) + COMMON_FLAGS += -DCUMEM_ENABLED + LDFLAGS += -lcuda + else + $(info - CUDA driver API disabled (DISABLE_CUMEM=1); POD comm unavailable on CUDA) + endif + endif + POD_ENABLED = 0 # Compile with pod support if # 1) DISABLE_POD_COMM is not set to 1 @@ -245,9 +268,12 @@ ifeq ($(filter clean,$(MAKECMDGOALS)),) ifeq ($(CUDA_VERSION_OK),yes) $(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which has MNNVL support) - COMMON_FLAGS += -DPOD_COMM_ENABLED - LDFLAGS += -lcuda - POD_ENABLED = 1 + ifeq ($(DISABLE_CUMEM),1) + $(info - Pod communication skipped on CUDA: requires CUMEM_ENABLED (DISABLE_CUMEM=1)) + else + COMMON_FLAGS += -DPOD_COMM_ENABLED + POD_ENABLED = 1 + endif else $(info - Detected CUDA version $(CUDA_MAJOR).$(CUDA_MINOR) which does not have MNNVL support) $(info - Pod support will require CUDA version of at least $(CUDA_MIN_MAJOR).$(CUDA_MIN_MINOR)) diff --git a/src/header/IbvDynload.hpp b/src/header/IbvDynload.hpp new file mode 100644 index 0000000..eac4654 --- /dev/null +++ b/src/header/IbvDynload.hpp @@ -0,0 +1,187 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/// @file IbvDynload.hpp +/// @brief libibverbs function pointers and optional dlopen/dlsym when not IBV_DIRECT. +/// @note Include when `NIC_EXEC_ENABLED` is defined (e.g. from `TransferBench.hpp` alongside other headers). + +#pragma once + +#include +#include +#include + +#if IBV_DIRECT +#define IBV_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name; +#else +#define IBV_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr; +#endif + +namespace { + +IBV_FN(ibv_alloc_pd, ibv_pd*, (ibv_context*)) +IBV_FN(ibv_close_device, int, (ibv_context*)) +IBV_FN(ibv_create_cq, ibv_cq*, (ibv_context*, int, void*, ibv_comp_channel*, int)) +IBV_FN(ibv_create_qp, ibv_qp*, (ibv_pd*, ibv_qp_init_attr*)) +IBV_FN(ibv_dealloc_pd, int, (ibv_pd*)) +IBV_FN(ibv_dereg_mr, int, (ibv_mr*)) +IBV_FN(ibv_destroy_cq, int, (ibv_cq*)) +IBV_FN(ibv_destroy_qp, int, (ibv_qp*)) +IBV_FN(ibv_free_device_list, void, (ibv_device**)) +IBV_FN(ibv_get_device_list, ibv_device**, (int*)) +IBV_FN(ibv_get_device_name, const char*, (ibv_device*)) +IBV_FN(ibv_modify_qp, int, (ibv_qp*, ibv_qp_attr*, int)) +IBV_FN(ibv_open_device, ibv_context*, (ibv_device*)) +IBV_FN(ibv_poll_cq, int, (ibv_cq*, int, ibv_wc*)) +IBV_FN(ibv_post_send, int, (ibv_qp*, ibv_send_wr*, ibv_send_wr**)) +IBV_FN(ibv_query_device, int, (ibv_context*, ibv_device_attr*)) +IBV_FN(ibv_query_gid, int, (ibv_context*, uint8_t, int, ibv_gid*)) +#if IBV_DIRECT +// On older versions of libibverbs, ibv_query_port is not defined in the header file. +constexpr int (*pfn_ibv_query_port)(ibv_context*, uint8_t, ibv_port_attr*) = ___ibv_query_port; +#else +IBV_FN(ibv_query_port, int, (ibv_context*, uint8_t, ibv_port_attr*)) +#endif +#ifdef HAVE_DMABUF_SUPPORT +IBV_FN(ibv_reg_dmabuf_mr, ibv_mr*, (ibv_pd*, uint64_t, size_t, uint64_t, int, int)) +#endif +IBV_FN(ibv_reg_mr, ibv_mr*, (ibv_pd*, void*, size_t, int)) + +} // namespace + +#if IBV_DIRECT + +inline void TbIbvEnsureLoaded() {} +inline bool TbIbvSymbolsReady() { return true; } +inline void* TbIbvDlHandle() { return nullptr; } +inline void TbIbvUnload() {} + +#else + +struct IbvDynloadState { + std::once_flag once{}; + void* handle = nullptr; + bool loaded = false; + + void tryLoad() + { + handle = dlopen("libibverbs.so.1", RTLD_NOW); + if (handle == nullptr) + return; + + struct Symbol { void **ppfn; char const *name; }; + + Symbol symbols[] = { + {(void**)&pfn_ibv_alloc_pd, "ibv_alloc_pd"}, + {(void**)&pfn_ibv_close_device, "ibv_close_device"}, + {(void**)&pfn_ibv_create_cq, "ibv_create_cq"}, + {(void**)&pfn_ibv_create_qp, "ibv_create_qp"}, + {(void**)&pfn_ibv_dealloc_pd, "ibv_dealloc_pd"}, + {(void**)&pfn_ibv_dereg_mr, "ibv_dereg_mr"}, + {(void**)&pfn_ibv_destroy_cq, "ibv_destroy_cq"}, + {(void**)&pfn_ibv_destroy_qp, "ibv_destroy_qp"}, + {(void**)&pfn_ibv_free_device_list, "ibv_free_device_list"}, + {(void**)&pfn_ibv_get_device_list, "ibv_get_device_list"}, + {(void**)&pfn_ibv_get_device_name, "ibv_get_device_name"}, + {(void**)&pfn_ibv_modify_qp, "ibv_modify_qp"}, + {(void**)&pfn_ibv_open_device, "ibv_open_device"}, + {(void**)&pfn_ibv_poll_cq, "ibv_poll_cq"}, + {(void**)&pfn_ibv_post_send, "ibv_post_send"}, + {(void**)&pfn_ibv_query_device, "ibv_query_device"}, + {(void**)&pfn_ibv_query_gid, "ibv_query_gid"}, + {(void**)&pfn_ibv_query_port, "ibv_query_port"}, +#ifdef HAVE_DMABUF_SUPPORT + {(void**)&pfn_ibv_reg_dmabuf_mr, "ibv_reg_dmabuf_mr"}, +#endif + {(void**)&pfn_ibv_reg_mr, "ibv_reg_mr"}, + }; + + for (Symbol const& s : symbols) { + void* sym = dlsym(handle, s.name); + if (sym == nullptr) { + dlclose(handle); + handle = nullptr; + return; + } + *s.ppfn = sym; + } + loaded = true; + } +}; + +inline IbvDynloadState& ibvDynloadState() +{ + static IbvDynloadState s; + return s; +} + +inline void TbIbvEnsureLoaded() +{ + IbvDynloadState& st = ibvDynloadState(); + std::call_once(st.once, [&]() { st.tryLoad(); }); +} + +inline bool TbIbvSymbolsReady() +{ + TbIbvEnsureLoaded(); + return ibvDynloadState().loaded; +} + +inline void* TbIbvDlHandle() +{ + TbIbvEnsureLoaded(); + return ibvDynloadState().handle; +} + +inline void TbIbvUnload() +{ + IbvDynloadState& st = ibvDynloadState(); + if (st.handle != nullptr) { + dlclose(st.handle); + st.handle = nullptr; + st.loaded = false; + pfn_ibv_alloc_pd = nullptr; + pfn_ibv_close_device = nullptr; + pfn_ibv_create_cq = nullptr; + pfn_ibv_create_qp = nullptr; + pfn_ibv_dealloc_pd = nullptr; + pfn_ibv_dereg_mr = nullptr; + pfn_ibv_destroy_cq = nullptr; + pfn_ibv_destroy_qp = nullptr; + pfn_ibv_free_device_list = nullptr; + pfn_ibv_get_device_list = nullptr; + pfn_ibv_get_device_name = nullptr; + pfn_ibv_modify_qp = nullptr; + pfn_ibv_open_device = nullptr; + pfn_ibv_poll_cq = nullptr; + pfn_ibv_post_send = nullptr; + pfn_ibv_query_device = nullptr; + pfn_ibv_query_gid = nullptr; + pfn_ibv_query_port = nullptr; +#ifdef HAVE_DMABUF_SUPPORT + pfn_ibv_reg_dmabuf_mr = nullptr; +#endif + pfn_ibv_reg_mr = nullptr; + } +} + +#endif // !IBV_DIRECT \ No newline at end of file diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 023991b..339d97c 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -54,7 +54,7 @@ THE SOFTWARE. #include #ifdef NIC_EXEC_ENABLED -#include +#include "IbvDynload.hpp" #endif #ifdef MPI_COMM_ENABLED @@ -737,10 +737,11 @@ namespace TransferBench } while (0) // Helper macros for calling RDMA functions and reporting errors +#ifdef NIC_EXEC_ENABLED #ifdef VERBS_DEBUG #define IBV_CALL(__func__, ...) \ do { \ - int error = __func__(__VA_ARGS__); \ + int error = pfn_##__func__(__VA_ARGS__); \ if (error != 0) { \ return {ERR_FATAL, "Encountered IbVerbs error (%d) at line (%d) " \ "and function (%s)", (error), __LINE__, #__func__}; \ @@ -749,7 +750,7 @@ namespace TransferBench #define IBV_PTR_CALL(__ptr__, __func__, ...) \ do { \ - __ptr__ = __func__(__VA_ARGS__); \ + __ptr__ = pfn_##__func__(__VA_ARGS__); \ if (__ptr__ == nullptr) { \ return {ERR_FATAL, "Encountered IbVerbs nullptr error at line (%d) " \ "and function (%s)", __LINE__, #__func__}; \ @@ -758,7 +759,7 @@ namespace TransferBench #else #define IBV_CALL(__func__, ...) \ do { \ - int error = __func__(__VA_ARGS__); \ + int error = pfn_##__func__(__VA_ARGS__); \ if (error != 0) { \ return {ERR_FATAL, "Encountered IbVerbs error (%d=%s) in func (%s)" \ , error, strerror(errno), #__func__}; \ @@ -767,13 +768,14 @@ namespace TransferBench #define IBV_PTR_CALL(__ptr__, __func__, ...) \ do { \ - __ptr__ = __func__(__VA_ARGS__); \ + __ptr__ = pfn_##__func__(__VA_ARGS__); \ if (__ptr__ == nullptr) { \ return {ERR_FATAL, "Encountered IbVerbs nullptr error (%s) in func (%s) " \ , strerror(errno), #__func__}; \ } \ } while (0) #endif +#endif // NIC_EXEC_ENABLED namespace TransferBench { @@ -1006,6 +1008,7 @@ namespace { bool IsSamePod(int targetRank, int sourceRank) const; std::string GetExecutorName(ExeDevice exeDevice) const; int NicIsActive(int nicIndex, int targetRank) const; + bool IbvLoaded() const; #if !defined(__NVCC__) ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent) const; @@ -1032,6 +1035,7 @@ namespace { bool verbose = false; bool rankDoesOutput = true; FILE* dumpCfgFile = nullptr; + bool ibvLoaded = false; #if !defined(__NVCC__) std::vector cpuAgents; @@ -2341,6 +2345,11 @@ namespace { case EXE_NIC: case EXE_NIC_NEAREST: #ifdef NIC_EXEC_ENABLED { + if (!System::Get().IbvLoaded()) { + errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but IB verbs is not loaded.", i}); + hasFatalError = true; + break; + } // NIC Executors can only execute a copy operation if (t.srcs.size() != 1 || t.dsts.size() != 1) { errors.push_back({ERR_FATAL, "Transfer %d: NIC executor requires single SRC and single DST", i}); @@ -2671,7 +2680,7 @@ namespace { #ifdef NIC_EXEC_ENABLED // Function to collect information about IBV devices //======================================================================================== -static bool IsConfiguredGid(union ibv_gid const& gid) + static bool IsConfiguredGid(union ibv_gid const& gid) { const struct in6_addr *a = (struct in6_addr *) gid.raw; int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]); @@ -2696,7 +2705,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) int const& gidIndex, int& version) { - char const* deviceName = ibv_get_device_name(context->device); + char const* deviceName; + IBV_PTR_CALL(deviceName, ibv_get_device_name, context->device); char gidRoceVerStr[16] = {}; char roceTypePath[PATH_MAX] = {}; sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", @@ -2783,6 +2793,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) static bool isInitialized = false; static vector ibvDeviceList = {}; +#if !defined(IBV_DIRECT) + if (!TbIbvSymbolsReady() && !isInitialized) { + isInitialized = true; + } +#endif // Build list on first use if (!isInitialized) { @@ -4886,7 +4901,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Use DMA copy engine do { -#if defined(__NVCC__) +#if defined(CUMEM_ENABLED) ERR_CHECK(cuMemcpyAsync((CUdeviceptr)resources.dstMem[0], (CUdeviceptr)resources.srcMem[0], resources.numBytes, stream)); @@ -5023,32 +5038,32 @@ static bool IsConfiguredGid(union ibv_gid const& gid) } } -#if defined(__NVCC__) - ErrResult::ErrResult(CUresult err) +#if !defined(__NVCC__) + ErrResult::ErrResult(hsa_status_t err) { - if (err == CUDA_SUCCESS) { + if (err == HSA_STATUS_SUCCESS) { this->errType = ERR_NONE; this->errMsg = ""; } else { - const char *errString = NULL, *errName = NULL; - cuGetErrorName(err, &errName); - cuGetErrorString(err, &errString); + const char *errString = NULL; + hsa_status_string(err, &errString); this->errType = ERR_FATAL; - this->errMsg = std::string("CUDA Driver Error: ") + errName - + " (" + errString + ")"; + this->errMsg = std::string("HSA Error: ") + errString; } } -#else - ErrResult::ErrResult(hsa_status_t err) +#elif defined(CUMEM_ENABLED) + ErrResult::ErrResult(CUresult err) { - if (err == HSA_STATUS_SUCCESS) { + if (err == CUDA_SUCCESS) { this->errType = ERR_NONE; this->errMsg = ""; } else { - const char *errString = NULL; - hsa_status_string(err, &errString); + const char *errString = NULL, *errName = NULL; + cuGetErrorName(err, &errName); + cuGetErrorString(err, &errString); this->errType = ERR_FATAL; - this->errMsg = std::string("HSA Error: ") + errString; + this->errMsg = std::string("CUDA Driver Error: ") + errName + + " (" + errString + ")"; } } #endif @@ -5810,6 +5825,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid) Log("[INFO] Running in single node mode\n"); } +#ifdef NIC_EXEC_ENABLED + TbIbvEnsureLoaded(); + ibvLoaded = TbIbvSymbolsReady(); +#if !defined(IBV_DIRECT) + if (!ibvLoaded) { + Log("[WARN] Failed to load libibverbs.so.1 or required symbols\n"); + } +#endif +#endif + // Collect topology and distribute across all ranks CollectTopology(); } @@ -5842,9 +5867,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid) fclose(dumpCfgFile); } +#ifdef NIC_EXEC_ENABLED + TbIbvUnload(); +#endif + #ifdef AMD_SMI_ENABLED amdsmi_shut_down(); -#elif defined(__NVCC__) && defined(POD_COMM_ENABLED) +#elif defined(NVML_ENABLED) nvmlShutdown(); #endif } @@ -7016,6 +7045,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid) return rankInfo[targetRank].nicIsActive.at(nicIndex); } + bool System::IbvLoaded() const + { + return ibvLoaded; + } + int GetNumExecutors(ExeType exeType, int targetRank) { return System::Get().GetNumExecutors(exeType, targetRank);