diff --git a/src/client/Presets/NicPeerToPeer.hpp b/src/client/Presets/NicPeerToPeer.hpp index f0e0def..8c54baa 100644 --- a/src/client/Presets/NicPeerToPeer.hpp +++ b/src/client/Presets/NicPeerToPeer.hpp @@ -22,100 +22,6 @@ THE SOFTWARE. // Helper functions -// Returns a schedule of round robin pairing of N elements, using Circle Method -// if parallel, each round contains N/2 pairs, otherwise serial -void RoundRobinSchedule(std::vector>>& schedule, - int N, int parallel = 0) { - if (N == 1) { - schedule.push_back({{0,0}}); - return; - } - // Generate standard round-robin tournament (maximum parallelism) - std::vector>> fullSchedule; - - // Pad odd number of ranks with a dummy round (N+1) - int paddedN = N + N%2; - // Round-robin tournament scheduling - for (int round = 0; round < paddedN - 1; round++) { - std::vector> roundPairs; - std::vector> roundPairsReversed; - for (int i = 0; i < paddedN / 2; i++) { - int item1 = i; - int item2 = paddedN - 1 - i; - if (round > 0) { - // Rotate all except the first item - if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; - if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; - } - // Ignore dummy round, its partner sits out this ronud - if (item1 < N && item2 < N){ - roundPairs.push_back({item1, item2}); - roundPairsReversed.push_back({item2, item1}); - } - } - fullSchedule.push_back(roundPairs); - fullSchedule.push_back(roundPairsReversed); - } - - // A loopback round where all run in parallel - std::vector> selfRound; - for (int i = 0; i < N; i++) { - selfRound.push_back({i, i}); - } - fullSchedule.push_back(selfRound); - - if (parallel) { - schedule = std::move(fullSchedule); - } else { - // Serialize each round if needed - for (auto const& fullRound : fullSchedule) { - for (auto const& match : fullRound) { - std::vector> subRound; - subRound.push_back({match.first, match.second}); - schedule.push_back(subRound); - } - } - } -} - -// Returns a schedule for ordered 2-combination of N elements 
-// by pairing the list with its rotating self, -// each round contains n pairs, where 1 <= n <= N and N is divisible by n -// and an element cannot appear more than twice in a round, -void CombinationSchedule(std::vector>>& schedule, - int N, int n = 0) { - std::vector>> fullSchedule; - - if (n <= 0) n = N; - if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round - { - n = 1; - Utils::Print("[WARN] cannot create round robin schedule, falling back to serial"); - } - - // Generate rounds of combination based on incrementing distance - for (int i = 0; i < N; i++) { - std::vector> round; - for (int j = 0; j < N; j++) { - round.push_back({j, (j+i)%N}); - } - fullSchedule.push_back(round); - } - - // Step 2: Split each full round into sub-rounds with at most n pairs - for (auto const& fullRound : fullSchedule) { - for (size_t start = 0; start < fullRound.size(); start += n) { - std::vector> subRound; - for (size_t i = start; i < start + n && i < fullRound.size(); i++) { - subRound.push_back(fullRound[i]); - } - if (!subRound.empty()) { - schedule.push_back(subRound); - } - } - } -} - int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) { return TransferBench::IsCpuMemType(memType) ? 
TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) : @@ -203,8 +109,8 @@ int NicPeerToPeerPreset(EnvVars& ev, std::vector>> schedule; std::vector>> nicSchedule; - RoundRobinSchedule(schedule, numRanks, nodeParallel); - CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); + Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel); + Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank; int counter = 0; diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp index e03d388..b449ca8 100644 --- a/src/client/Presets/PodAllToAll.hpp +++ b/src/client/Presets/PodAllToAll.hpp @@ -20,27 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -// Reorder elements of list by stepping through with stride k, wrapping around. -// When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are -// concatenated, so every element appears exactly once in the output. -// The reordered list will be further separated into different groups. 
-void StrideGenerate(std::vector& list, int k) { - int n = list.size(); - k = ((k % n) + n) % n; // normalize to 0..n-1 - if (k == 0) return; - - int d = std::gcd(k, n); - std::vector out; - out.reserve(n); - - for (int s = 0; s < d; s++) { - for (int j = 0; j < n / d; j++) { - out.push_back(list[(s + j * k) % n]); - } - } - list = std::move(out); -} - int PodAllToAllPreset(EnvVars& ev, size_t const numBytesPerTransfer, std::string const presetName, @@ -164,7 +143,7 @@ int PodAllToAllPreset(EnvVars& ev, std::vector devices(n); std::vector indices(n); for (int k = 0; k < n; k++) indices[k] = k; - StrideGenerate(indices, stride); + Utils::StrideGenerate(indices, stride); int idx = 0; for (int rank : ranks) { for (int devIdx = 0; devIdx < numGpus; devIdx++) { @@ -172,9 +151,17 @@ int PodAllToAllPreset(EnvVars& ev, } } + // Build transfers for every group, then run once per pod so all groups share the same + // timed iterations (traffic across groups is concurrent within RunTransfers). + std::vector podTransfers; + std::vector groupTransferBase(numGroups); + std::vector>> groupReIndexes(numGroups); + for (int group = 0; group < numGroups; group++) { - std::vector> groupReIndex(groupSize, std::vector(groupSize, -1)); - std::vector transfers; + groupTransferBase[group] = podTransfers.size(); + groupReIndexes[group].assign(groupSize, std::vector(groupSize, -1)); + std::vector>& groupReIndex = groupReIndexes[group]; + for (int i = group * groupSize; i < (group + 1) * groupSize; i++) { for (int j = group * groupSize; j < (group + 1) * groupSize; j++) { if (i == j) { @@ -192,8 +179,9 @@ int PodAllToAllPreset(EnvVars& ev, transfer.numSubExecs = numSubExecs; int const localI = i - group * groupSize; int const localJ = j - group * groupSize; - groupReIndex[localI][localJ] = (int)transfers.size(); - transfers.push_back(transfer); + groupReIndex[localI][localJ] = + (int)(podTransfers.size() - groupTransferBase[group]); + podTransfers.push_back(transfer); } if (numQueuePairs > 0) 
{ @@ -206,19 +194,47 @@ int PodAllToAllPreset(EnvVars& ev, (int32_t)devices[i].memIndex, (int32_t)devices[i].memRank}; transfer.exeSubIndex = devices[next].memIndex; transfer.numSubExecs = numQueuePairs; - transfers.push_back(transfer); + podTransfers.push_back(transfer); } } - TransferBench::TestResults results; - if (!TransferBench::RunTransfers(cfg, transfers, results)) { - for (auto const& err : results.errResults) - Utils::Print("%s\n", err.errMsg.c_str()); - return 1; - } - if (showDetails) { - Utils::PrintResults(ev, 1, transfers, results); + } + + if (Utils::RankDoesOutput()) { + for (int g = 0; g < numGroups; g++) { + int const gb = g * groupSize; + Utils::Print("A2A group %d:", g); + std::vector ord(groupSize); + for (int i = 0; i < groupSize; i++) ord[i] = i; + std::sort(ord.begin(), ord.end(), [&](int a, int b) { + MemDevice const& da = devices[gb + a]; + MemDevice const& db = devices[gb + b]; + if (da.memRank != db.memRank) return da.memRank < db.memRank; + return da.memIndex < db.memIndex; + }); + for (size_t si = 0; si < ord.size(); si++) { + MemDevice const& d = devices[gb + ord[si]]; + Utils::Print("%s R%d:G%d", si ? 
"," : "", d.memRank, d.memIndex); + } Utils::Print("\n"); } + } + + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, podTransfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + if (Utils::RankDoesOutput()) + Utils::Print("\n--- Pod AllToAll (all %d groups concurrent) ---\n", numGroups); + Utils::PrintResults(ev, 1, podTransfers, results); + Utils::Print("\n"); + } + + for (int group = 0; group < numGroups; group++) { + std::vector> const& groupReIndex = groupReIndexes[group]; + size_t const tfrBase = groupTransferBase[group]; // Per-group bandwidth table std::vector> groupBw(groupSize, std::vector(groupSize, -1.0)); @@ -226,14 +242,42 @@ int PodAllToAllPreset(EnvVars& ev, for (int localJ = 0; localJ < groupSize; localJ++) { int const k = groupReIndex[localI][localJ]; if (k >= 0) - groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec; + groupBw[localI][localJ] = results.tfrResults[tfrBase + k].avgBandwidthGbPerSec; } } if (Utils::RankDoesOutput()) { Utils::Print("\n--- Pod AllToAll Group %d ---\n", group); int const groupBase = group * groupSize; + + // Display order: group devices by MPI rank, then GPU index (stride only affects execution order). 
+ std::vector order(groupSize); + for (int i = 0; i < groupSize; i++) order[i] = i; + std::sort(order.begin(), order.end(), [&](int a, int b) { + MemDevice const& da = devices[groupBase + a]; + MemDevice const& db = devices[groupBase + b]; + if (da.memRank != db.memRank) return da.memRank < db.memRank; + return da.memIndex < db.memIndex; + }); + std::vector colRanks; + for (int slot : order) { + int const r = devices[groupBase + slot].memRank; + if (colRanks.empty() || colRanks.back() != r) colRanks.push_back(r); + } + std::vector> localsPerCol; + localsPerCol.reserve(colRanks.size()); + for (int dr : colRanks) { + std::vector loc; + for (int li = 0; li < groupSize; li++) { + if (devices[groupBase + li].memRank == dr) loc.push_back(li); + } + std::sort(loc.begin(), loc.end(), [&](int a, int b) { + return devices[groupBase + a].memIndex < devices[groupBase + b].memIndex; + }); + localsPerCol.push_back(std::move(loc)); + } + int const numRows = 2 + groupSize; - int const numCols = 2 + groupSize; + int const numCols = 2 + (int)colRanks.size(); int const precision = 2; Utils::TableHelper table(numRows, numCols, precision); table.DrawRowBorder(0); @@ -245,35 +289,46 @@ int PodAllToAllPreset(EnvVars& ev, table.DrawColBorder(1); table.Set(1, 1, " Mem Device "); - // Column headers - int colPrevRank = -1; - for (int j = 0; j < groupSize; j++) { - int colIdx = 2 + j; - int r = devices[groupBase + j].memRank; - if (r != colPrevRank) { - table.DrawColBorder(colIdx); - table.Set(0, colIdx, " Rank %02d ", r); - colPrevRank = r; + for (size_t c = 0; c < colRanks.size(); c++) { + int const colIdx = 2 + (int)c; + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", colRanks[c]); + std::string gpuHdr; + for (int li : localsPerCol[c]) { + char t[24]; + snprintf(t, sizeof(t), " GPU %02d ", devices[groupBase + li].memIndex); + gpuHdr += t; } - table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex); + table.Set(1, colIdx, "%s", gpuHdr.c_str()); + 
table.SetColAlignment((int)c + 2, Utils::TableHelper::ALIGN_LEFT); } - // Row headers and data int rowPrevRank = -1; - for (int localI = 0; localI < groupSize; localI++) { - int rowIdx = 2 + localI; - int r = devices[groupBase + localI].memRank; + for (int disp = 0; disp < groupSize; disp++) { + int const localI = order[disp]; + int const rowIdx = 2 + disp; + int const r = devices[groupBase + localI].memRank; if (r != rowPrevRank) { table.DrawRowBorder(rowIdx); table.Set(rowIdx, 0, " Rank %02d ", r); rowPrevRank = r; + } else { + table.Set(rowIdx, 0, " "); } table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex); - for (int localJ = 0; localJ < groupSize; localJ++) { - if (groupBw[localI][localJ] >= 0) - table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]); - else - table.Set(rowIdx, 2 + localJ, " N/A "); + for (size_t c = 0; c < colRanks.size(); c++) { + std::string cell; + for (int localJ : localsPerCol[c]) { + char t[16]; + if (groupBw[localI][localJ] >= 0) + snprintf(t, sizeof(t), " %7.2f", groupBw[localI][localJ]); + else + snprintf(t, sizeof(t), " %7s", "N/A"); + cell += t; + } + int const colIdx = 2 + (int)c; + table.Set(rowIdx, colIdx, "%s", cell.c_str()); + table.SetCellAlignment(rowIdx, colIdx, Utils::TableHelper::ALIGN_LEFT); } } table.PrintTable(ev.outputToCsv, ev.showBorders); diff --git a/src/client/Presets/PodPeerToPeer.hpp b/src/client/Presets/PodPeerToPeer.hpp index 2148bd4..9ea8ca7 100644 --- a/src/client/Presets/PodPeerToPeer.hpp +++ b/src/client/Presets/PodPeerToPeer.hpp @@ -126,7 +126,7 @@ int PodPeerToPeerPreset(EnvVars& ev, } else { // parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair std::vector>> nodePairSchedule; - RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); + Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); for (auto const& roundNodePairs : nodePairSchedule) { for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) { diff --git 
a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp new file mode 100644 index 0000000..5b449e5 --- /dev/null +++ b/src/client/Presets/PodRing.hpp @@ -0,0 +1,280 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodRingPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + // Assuming single pod, for now + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] PodRing preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + if (Utils::GetRankPerPodMap().empty()) { + Utils::Print("[ERROR] No pods detected. 
Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } + + ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); + + int numRanks = TransferBench::GetNumRanks(); + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numGpus); + + if (numGpus <= 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); + return 1; + } + if (groupSize < 2) { + Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n"); + return 1; + } + if (numRanks * numGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, numRanks * numGpus, numRanks); + return 1; + } + + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n"); + return 1; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if 
(!ev.hideEnv) { + if (!ev.outputToCsv) printf("[PodRing Related]\n"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); + printf("\n"); + } + } + + Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? 
EXE_GPU_DMA : EXE_GPU_GFX; + + int n = numRanks * numGpus; + int numGroups = n / groupSize; + + std::vector<int> indices(n); + for (int k = 0; k < n; k++) indices[k] = k; + Utils::StrideGenerate(indices, stride); + + std::vector<MemDevice> devices(n); + for (int i = 0; i < n; i++) { + int const globalIdx = indices[i]; + int const rank = globalIdx / numGpus; + int const devIdx = globalIdx % numGpus; + devices[i] = {memType, devIdx, rank}; + } + + Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize); + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + Utils::Print(" Ring %d: ", group); + for (int i = 0; i < groupSize; i++) { + Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); + } + Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); + } + Utils::Print("\n"); + + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + std::vector<TransferBench::Transfer> transfers; + + for (int i = 0; i < groupSize; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(devices[srcIdx]); + transfer.dsts.push_back(devices[dstIdx]); + transfer.exeDevice = {exeType, + (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), + (int32_t)(useRemoteRead ?
devices[dstIdx].memRank : devices[srcIdx].memRank)}; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numSubExecs; + transfers.push_back(transfer); + + if (numQueuePairs > 0) { + TransferBench::Transfer nicTransfer; + nicTransfer.numBytes = numBytesPerTransfer; + nicTransfer.srcs.push_back(devices[srcIdx]); + nicTransfer.dsts.push_back(devices[dstIdx]); + nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, + (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; + nicTransfer.exeSubIndex = devices[dstIdx].memIndex; + nicTransfer.numSubExecs = numQueuePairs; + transfers.push_back(nicTransfer); + } + } + + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + Utils::PrintResults(ev, 1, transfers, results); + Utils::Print("\n"); + } + + if (Utils::RankDoesOutput()) { + Utils::Print("\n--- Pod Ring Group %d ---\n", group); + + int const numHops = groupSize; + int const numRows = 2 + numHops + 3; + int const numCols = 6; + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + table.Set(0, 0, " Src "); + table.Set(0, 1, " Src "); + table.Set(0, 2, " Dst "); + table.Set(0, 3, " Dst "); + table.Set(0, 4, " GFX BW "); + table.Set(1, 0, " Rank "); + table.Set(1, 1, " GPU "); + table.Set(1, 2, " Rank "); + table.Set(1, 3, " GPU "); + table.Set(1, 4, " (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); + + if (numQueuePairs > 0) { + table.Set(0, 5, " NIC BW "); + table.Set(1, 5, " (GB/s) "); + } else { + table.Set(0, 5, " "); + table.Set(1, 5, " "); + } + + table.DrawRowBorder(2); + + double gfxMin = std::numeric_limits<double>::max(); + double gfxAvg = 0.0; + double gfxMax = std::numeric_limits<double>::lowest(); + double nicMin =
std::numeric_limits<double>::max(); + double nicAvg = 0.0; + double nicMax = std::numeric_limits<double>::lowest(); + + int tfrIdx = 0; + for (int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; + + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); + + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); + + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); + } + } + + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); + + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return 0; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index 8354208..5505681 100644 --- 
a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -43,6 +43,7 @@ THE SOFTWARE. #include "PeerToPeer.hpp" #include "PodAllToAll.hpp" #include "PodPeerToPeer.hpp" +#include "PodRing.hpp" #include "Scaling.hpp" #include "Schmoo.hpp" #include "SmokeTest.hpp" @@ -77,6 +78,7 @@ std::map presetFuncMap = {"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, {"poda2a", {PodAllToAllPreset, "All-to-all transfers between subgroups of ranks within a pod"}}, {"podp2p", {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}}, + {"podring", {PodRingPreset, "Ring transfers within subgroups of ranks in a pod"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 259e4cc..497770f 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -155,6 +155,24 @@ namespace TransferBench::Utils bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr); bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes); + // Reorder elements of list by stepping through with stride k, wrapping around. + // When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are + // concatenated, so every element appears exactly once in the output. + // The reordered list will be further separated into different groups. + void StrideGenerate(std::vector<int>& list, int k); + + // Returns a schedule of round robin pairing of N elements, using Circle Method. + // If parallel, each round contains N/2 pairs, otherwise serial. + void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int parallel = 0); + + // Returns a schedule for ordered 2-combination of N elements + // by pairing the list with its rotating self. 
+ // Each round contains n pairs, where 1 <= n <= N and N is divisible by n, + // and an element cannot appear more than twice in a round. + void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int n = 0); + // Implementation details below //================================================================ TableHelper::TableHelper(int numRows, int numCols, int precision) : @@ -769,4 +787,113 @@ namespace TransferBench::Utils { return (TransferBench::DeallocateMemory(memType, memPtr, bytes).errType != TransferBench::ERR_NONE); } + + void StrideGenerate(std::vector<int>& list, int k) + { + int n = list.size(); + if (n == 0) return; + k = ((k % n) + n) % n; // normalize to 0..n-1 + if (k == 0) return; + + int d = std::gcd(k, n); + std::vector<int> out; + out.reserve(n); + + for (int s = 0; s < d; s++) { + for (int j = 0; j < n / d; j++) { + out.push_back(list[(s + j * k) % n]); + } + } + list = std::move(out); + } + + void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int parallel) + { + if (N == 1) { + schedule.push_back({{0, 0}}); + return; + } + // Generate standard round-robin tournament (maximum parallelism) + std::vector<std::vector<std::pair<int, int>>> fullSchedule; + + // Pad odd number of ranks with a dummy round (N+1) + int paddedN = N + N % 2; + // Round-robin tournament scheduling + for (int round = 0; round < paddedN - 1; round++) { + std::vector<std::pair<int, int>> roundPairs; + std::vector<std::pair<int, int>> roundPairsReversed; + for (int i = 0; i < paddedN / 2; i++) { + int item1 = i; + int item2 = paddedN - 1 - i; + if (round > 0) { + // Rotate all except the first item + if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; + if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; + } + // Ignore dummy round, its partner sits out this round + if (item1 < N && item2 < N) { + roundPairs.push_back({item1, item2}); + roundPairsReversed.push_back({item2, item1}); + } + } + fullSchedule.push_back(roundPairs); + fullSchedule.push_back(roundPairsReversed); + } + + // A loopback round where all run in 
parallel + std::vector<std::pair<int, int>> selfRound; + for (int i = 0; i < N; i++) { + selfRound.push_back({i, i}); + } + fullSchedule.push_back(selfRound); + + if (parallel) { + schedule = std::move(fullSchedule); + } else { + // Serialize each round if needed + for (auto const& fullRound : fullSchedule) { + for (auto const& match : fullRound) { + std::vector<std::pair<int, int>> subRound; + subRound.push_back({match.first, match.second}); + schedule.push_back(subRound); + } + } + } + } + + void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int n) + { + std::vector<std::vector<std::pair<int, int>>> fullSchedule; + + if (n <= 0) n = N; + if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round + { + n = 1; + Print("[WARN] cannot create combination schedule, falling back to serial\n"); + } + + // Generate rounds of combination based on incrementing distance + for (int i = 0; i < N; i++) { + std::vector<std::pair<int, int>> round; + for (int j = 0; j < N; j++) { + round.push_back({j, (j + i) % N}); + } + fullSchedule.push_back(round); + } + + // Step 2: Split each full round into sub-rounds with at most n pairs + for (auto const& fullRound : fullSchedule) { + for (size_t start = 0; start < fullRound.size(); start += n) { + std::vector<std::pair<int, int>> subRound; + for (size_t i = start; i < start + n && i < fullRound.size(); i++) { + subRound.push_back(fullRound[i]); + } + if (!subRound.empty()) { + schedule.push_back(subRound); + } + } + } + } };