From 963d581156820f0c07b4e1b05c7ff2041ffa7a81 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 28 Apr 2026 16:10:39 +0000 Subject: [PATCH 1/5] adjusting grouping logic; lifting helper functions to Utilities --- src/client/Presets/NicPeerToPeer.hpp | 98 +--------- src/client/Presets/PodAllToAll.hpp | 23 +-- src/client/Presets/PodPeerToPeer.hpp | 2 +- src/client/Presets/PodRing.hpp | 267 +++++++++++++++++++++++++++ src/client/Presets/Presets.hpp | 2 + src/client/Utilities.hpp | 127 +++++++++++++ 6 files changed, 400 insertions(+), 119 deletions(-) create mode 100644 src/client/Presets/PodRing.hpp diff --git a/src/client/Presets/NicPeerToPeer.hpp b/src/client/Presets/NicPeerToPeer.hpp index f0e0def..8c54baa 100644 --- a/src/client/Presets/NicPeerToPeer.hpp +++ b/src/client/Presets/NicPeerToPeer.hpp @@ -22,100 +22,6 @@ THE SOFTWARE. // Helper functions -// Returns a schedule of round robin pairing of N elements, using Circle Method -// if parallel, each round contains N/2 pairs, otherwise serial -void RoundRobinSchedule(std::vector>>& schedule, - int N, int parallel = 0) { - if (N == 1) { - schedule.push_back({{0,0}}); - return; - } - // Generate standard round-robin tournament (maximum parallelism) - std::vector>> fullSchedule; - - // Pad odd number of ranks with a dummy round (N+1) - int paddedN = N + N%2; - // Round-robin tournament scheduling - for (int round = 0; round < paddedN - 1; round++) { - std::vector> roundPairs; - std::vector> roundPairsReversed; - for (int i = 0; i < paddedN / 2; i++) { - int item1 = i; - int item2 = paddedN - 1 - i; - if (round > 0) { - // Rotate all except the first item - if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; - if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; - } - // Ignore dummy round, its partner sits out this ronud - if (item1 < N && item2 < N){ - roundPairs.push_back({item1, item2}); - roundPairsReversed.push_back({item2, item1}); - } - } - fullSchedule.push_back(roundPairs); - fullSchedule.push_back(roundPairsReversed); - } - - // A loopback round where all run in parallel - std::vector> selfRound; - for (int i = 0; i < N; i++) { - selfRound.push_back({i, i}); - } - fullSchedule.push_back(selfRound); - - if (parallel) { - schedule = std::move(fullSchedule); - } else { - // Serialize each round if needed - for (auto const& fullRound : fullSchedule) { - for (auto const& match : fullRound) { - std::vector> subRound; - subRound.push_back({match.first, match.second}); - schedule.push_back(subRound); - } - } - } -} - -// Returns a schedule for ordered 2-combination of N elements -// by pairing the list with its rotating self, -// each round contains n pairs, where 1 <= n <= N and N is divisible by n -// and an element cannot appear more than twice in a round, -void CombinationSchedule(std::vector>>& schedule, - int N, int n = 0) { - std::vector>> fullSchedule; - - if (n <= 0) n = N; - if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round - { - n = 1; - Utils::Print("[WARN] cannot create round robin schedule, falling back to serial"); - } - - // Generate rounds of combination based on incrementing distance - for (int i = 0; i < N; i++) { - std::vector> round; - for (int j = 0; j < N; j++) { - round.push_back({j, (j+i)%N}); - } - fullSchedule.push_back(round); - } - - // Step 2: Split each full round into sub-rounds with at most n pairs - for (auto const& fullRound : fullSchedule) { - for (size_t start = 0; start < fullRound.size(); start += n) { - std::vector> subRound; - for (size_t i = start; i < start + n && i < fullRound.size(); i++) { - subRound.push_back(fullRound[i]); - } - if (!subRound.empty()) { - schedule.push_back(subRound); - } - } - } -} - int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) { return TransferBench::IsCpuMemType(memType) ? TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) : @@ -203,8 +109,8 @@ int NicPeerToPeerPreset(EnvVars& ev, std::vector>> schedule; std::vector>> nicSchedule; - RoundRobinSchedule(schedule, numRanks, nodeParallel); - CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); + Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel); + Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank; int counter = 0; diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp index e03d388..dc33f75 100644 --- a/src/client/Presets/PodAllToAll.hpp +++ b/src/client/Presets/PodAllToAll.hpp @@ -20,27 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -// Reorder elements of list by stepping through with stride k, wrapping around. -// When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are -// concatenated, so every element appears exactly once in the output. -// The reordered list will be further separated into different groups. -void StrideGenerate(std::vector& list, int k) { - int n = list.size(); - k = ((k % n) + n) % n; // normalize to 0..n-1 - if (k == 0) return; - - int d = std::gcd(k, n); - std::vector out; - out.reserve(n); - - for (int s = 0; s < d; s++) { - for (int j = 0; j < n / d; j++) { - out.push_back(list[(s + j * k) % n]); - } - } - list = std::move(out); -} - int PodAllToAllPreset(EnvVars& ev, size_t const numBytesPerTransfer, std::string const presetName, @@ -164,7 +143,7 @@ int PodAllToAllPreset(EnvVars& ev, std::vector devices(n); std::vector indices(n); for (int k = 0; k < n; k++) indices[k] = k; - StrideGenerate(indices, stride); + Utils::StrideGenerate(indices, stride); int idx = 0; for (int rank : ranks) { for (int devIdx = 0; devIdx < numGpus; devIdx++) { diff --git a/src/client/Presets/PodPeerToPeer.hpp b/src/client/Presets/PodPeerToPeer.hpp index 2148bd4..9ea8ca7 100644 --- a/src/client/Presets/PodPeerToPeer.hpp +++ b/src/client/Presets/PodPeerToPeer.hpp @@ -126,7 +126,7 @@ int PodPeerToPeerPreset(EnvVars& ev, } else { // parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair std::vector>> nodePairSchedule; - RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); + Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); for (auto const& roundNodePairs : nodePairSchedule) { for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) { diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp new file mode 100644 index 0000000..c591923 --- /dev/null +++ b/src/client/Presets/PodRing.hpp @@ -0,0 +1,267 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodRingPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName) +{ + ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); + + int numRanks = TransferBench::GetNumRanks(); + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numGpus); + + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n"); + return 1; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[PodRing Related]\n"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); + printf("\n"); + } + } + + if (numGpus <= 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); + return 1; + } + if (groupSize < 2) { + Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n"); + return 1; + } + if (numRanks * numDetectedGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, numRanks * numDetectedGpus, numRanks); + return 1; + } + + Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX; + + int n = numRanks * numGpus; + int numGroups = n / groupSize; + + std::vector indices(n); + for (int k = 0; k < n; k++) indices[k] = k; + Utils::StrideGenerate(indices, stride); + + std::vector devices(n); + for (int i = 0; i < n; i++) { + int const globalIdx = indices[i]; + int const rank = globalIdx / numGpus; + int const devIdx = globalIdx % numGpus; + devices[i] = {memType, devIdx, rank}; + } + + Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize); + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + Utils::Print(" Ring %d: ", group); + for (int i = 0; i < groupSize; i++) { + Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); + } + Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); + } + Utils::Print("\n"); + + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + std::vector transfers; + + for (int i = 0; i < groupSize; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(devices[srcIdx]); + transfer.dsts.push_back(devices[dstIdx]); + transfer.exeDevice = {exeType, + (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), + (int32_t)(useRemoteRead ? devices[dstIdx].memRank : devices[srcIdx].memRank)}; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numSubExecs; + transfers.push_back(transfer); + + if (numQueuePairs > 0) { + TransferBench::Transfer nicTransfer; + nicTransfer.numBytes = numBytesPerTransfer; + nicTransfer.srcs.push_back(devices[srcIdx]); + nicTransfer.dsts.push_back(devices[dstIdx]); + nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, + (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; + nicTransfer.exeSubIndex = devices[dstIdx].memIndex; + nicTransfer.numSubExecs = numQueuePairs; + transfers.push_back(nicTransfer); + } + } + + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + Utils::PrintResults(ev, 1, transfers, results); + Utils::Print("\n"); + } + + if (Utils::RankDoesOutput()) { + Utils::Print("\n--- Pod Ring Group %d ---\n", group); + + int const numHops = groupSize; + int const numRows = 2 + numHops + 3; + int const numCols = 6; + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + table.Set(0, 0, " Src "); + table.Set(0, 1, " Src "); + table.Set(0, 2, " Dst "); + table.Set(0, 3, " Dst "); + table.Set(0, 4, " GFX BW "); + table.Set(1, 0, " Rank "); + table.Set(1, 1, " GPU "); + table.Set(1, 2, " Rank "); + table.Set(1, 3, " GPU "); + table.Set(1, 4, " (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); + + if (numQueuePairs > 0) { + table.Set(0, 5, " NIC BW "); + table.Set(1, 5, " (GB/s) "); + } else { + table.Set(0, 5, " "); + table.Set(1, 5, " "); + } + + table.DrawRowBorder(2); + + double gfxMin = std::numeric_limits::max(); + double gfxAvg = 0.0; + double gfxMax = std::numeric_limits::lowest(); + double nicMin = std::numeric_limits::max(); + double nicAvg = 0.0; + double nicMax = std::numeric_limits::lowest(); + + int tfrIdx = 0; + for (int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; + + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); + + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); + + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); + } + } + + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); + + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return 0; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index 8354208..09e25b2 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -43,6 +43,7 @@ THE SOFTWARE. #include "PeerToPeer.hpp" #include "PodAllToAll.hpp" #include "PodPeerToPeer.hpp" +#include "PodRing.hpp" #include "Scaling.hpp" #include "Schmoo.hpp" #include "SmokeTest.hpp" @@ -77,6 +78,7 @@ std::map presetFuncMap = {"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, {"poda2a", {PodAllToAllPreset, "All-to-all transfers between subgroups of ranks within a pod"}}, {"podp2p", {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}}, + {"podring", {PodRingPreset, "Ring transfers within subgroups of ranks in a pod"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 259e4cc..017ca17 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -155,6 +155,24 @@ namespace TransferBench::Utils bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr); bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes); + // Reorder elements of list by stepping through with stride k, wrapping around. + // When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are + // concatenated, so every element appears exactly once in the output. + // The reordered list will be further separated into different groups. + void StrideGenerate(std::vector& list, int k); + + // Returns a schedule of round robin pairing of N elements, using Circle Method. + // If parallel, each round contains N/2 pairs, otherwise serial. + void RoundRobinSchedule(std::vector>>& schedule, + int N, int parallel = 0); + + // Returns a schedule for ordered 2-combination of N elements + // by pairing the list with its rotating self. + // Each round contains n pairs, where 1 <= n <= N and N is divisible by n, + // and an element cannot appear more than twice in a round. + void CombinationSchedule(std::vector>>& schedule, + int N, int n = 0); + // Implementation details below //================================================================ TableHelper::TableHelper(int numRows, int numCols, int precision) : @@ -769,4 +787,113 @@ namespace TransferBench::Utils { return (TransferBench::DeallocateMemory(memType, memPtr, bytes).errType != TransferBench::ERR_NONE); } + + void StrideGenerate(std::vector& list, int k) + { + int n = list.size(); + if (n == 0) return; + k = ((k % n) + n) % n; // normalize to 0..n-1 + if (k == 0) return; + + int d = std::gcd(k, n); + std::vector out; + out.reserve(n); + + for (int s = 0; s < d; s++) { + for (int j = 0; j < n / d; j++) { + out.push_back(list[(s + j * k) % n]); + } + } + list = std::move(out); + } + + void RoundRobinSchedule(std::vector>>& schedule, + int N, int parallel) + { + if (N == 1) { + schedule.push_back({{0, 0}}); + return; + } + // Generate standard round-robin tournament (maximum parallelism) + std::vector>> fullSchedule; + + // Pad odd number of ranks with a dummy round (N+1) + int paddedN = N + N % 2; + // Round-robin tournament scheduling + for (int round = 0; round < paddedN - 1; round++) { + std::vector> roundPairs; + std::vector> roundPairsReversed; + for (int i = 0; i < paddedN / 2; i++) { + int item1 = i; + int item2 = paddedN - 1 - i; + if (round > 0) { + // Rotate all except the first item + if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; + if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; + } + // Ignore dummy round, its partner sits out this ronud + if (item1 < N && item2 < N) { + roundPairs.push_back({item1, item2}); + roundPairsReversed.push_back({item2, item1}); + } + } + fullSchedule.push_back(roundPairs); + fullSchedule.push_back(roundPairsReversed); + } + + // A loopback round where all run in parallel + std::vector> selfRound; + for (int i = 0; i < N; i++) { + selfRound.push_back({i, i}); + } + fullSchedule.push_back(selfRound); + + if (parallel) { + schedule = std::move(fullSchedule); + } else { + // Serialize each round if needed + for (auto const& fullRound : fullSchedule) { + for (auto const& match : fullRound) { + std::vector> subRound; + subRound.push_back({match.first, match.second}); + schedule.push_back(subRound); + } + } + } + } + + void CombinationSchedule(std::vector>>& schedule, + int N, int n) + { + std::vector>> fullSchedule; + + if (n <= 0) n = N; + if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round + { + n = 1; + Print("[WARN] cannot create round robin schedule, falling back to serial"); + } + + // Generate rounds of combination based on incrementing distance + for (int i = 0; i < N; i++) { + std::vector> round; + for (int j = 0; j < N; j++) { + round.push_back({j, (j + i) % N}); + } + fullSchedule.push_back(round); + } + + // Step 2: Split each full round into sub-rounds with at most n pairs + for (auto const& fullRound : fullSchedule) { + for (size_t start = 0; start < fullRound.size(); start += n) { + std::vector> subRound; + for (size_t i = start; i < start + n && i < fullRound.size(); i++) { + subRound.push_back(fullRound[i]); + } + if (!subRound.empty()) { + schedule.push_back(subRound); + } + } + } + } }; From 6886db230a916d63831703b15e6a9196c4a123fe Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 28 Apr 2026 17:33:19 +0000 Subject: [PATCH 2/5] addition of pod loop and minor fixes --- src/client/Presets/PodRing.hpp | 293 +++++++++++++++++---------------- src/client/Presets/Presets.hpp | 2 +- src/client/Utilities.hpp | 4 +- 3 files changed, 154 insertions(+), 145 deletions(-) diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp index c591923..8e0b6f3 100644 --- a/src/client/Presets/PodRing.hpp +++ b/src/client/Presets/PodRing.hpp @@ -20,9 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -int PodRingPreset(EnvVars& ev, - size_t const numBytesPerTransfer, - std::string const presetName) +int PodRingPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) { ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); @@ -94,166 +95,174 @@ int PodRingPreset(EnvVars& ev, TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX; - int n = numRanks * numGpus; - int numGroups = n / groupSize; + Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); + if (rankToPod.empty()) { + Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } - std::vector indices(n); - for (int k = 0; k < n; k++) indices[k] = k; - Utils::StrideGenerate(indices, stride); + for (auto const& [pod, ranks] : rankToPod) { + int n = ranks.size() * numGpus; + int numGroups = n / groupSize; - std::vector devices(n); - for (int i = 0; i < n; i++) { - int const globalIdx = indices[i]; - int const rank = globalIdx / numGpus; - int const devIdx = globalIdx % numGpus; - devices[i] = {memType, devIdx, rank}; - } + std::vector indices(n); + for (int k = 0; k < n; k++) indices[k] = k; + Utils::StrideGenerate(indices, stride); - Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize); - for (int group = 0; group < numGroups; group++) { - int const groupBase = group * groupSize; - Utils::Print(" Ring %d: ", group); - for (int i = 0; i < groupSize; i++) { - Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); + std::vector devices(n); + for (int i = 0; i < n; i++) { + int const globalIdx = indices[i]; + int const rank = ranks[globalIdx / numGpus]; + int const devIdx = globalIdx % numGpus; + devices[i] = {memType, devIdx, rank}; } - Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); - } - Utils::Print("\n"); - - for (int group = 0; group < numGroups; group++) { - int const groupBase = group * groupSize; - std::vector transfers; - - for (int i = 0; i < groupSize; i++) { - int srcIdx = groupBase + i; - int dstIdx = groupBase + (i + 1) % groupSize; - - TransferBench::Transfer transfer; - transfer.numBytes = numBytesPerTransfer; - transfer.srcs.push_back(devices[srcIdx]); - transfer.dsts.push_back(devices[dstIdx]); - transfer.exeDevice = {exeType, - (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), - (int32_t)(useRemoteRead ? devices[dstIdx].memRank : devices[srcIdx].memRank)}; - transfer.exeSubIndex = -1; - transfer.numSubExecs = numSubExecs; - transfers.push_back(transfer); - - if (numQueuePairs > 0) { - TransferBench::Transfer nicTransfer; - nicTransfer.numBytes = numBytesPerTransfer; - nicTransfer.srcs.push_back(devices[srcIdx]); - nicTransfer.dsts.push_back(devices[dstIdx]); - nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, - (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; - nicTransfer.exeSubIndex = devices[dstIdx].memIndex; - nicTransfer.numSubExecs = numQueuePairs; - transfers.push_back(nicTransfer); + + Utils::Print("Pod %ld: %d ring(s) of %d devices:\n", pod, numGroups, groupSize); + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + Utils::Print(" Ring %d: ", group); + for (int i = 0; i < groupSize; i++) { + Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); } + Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); } + Utils::Print("\n"); - TransferBench::TestResults results; - if (!TransferBench::RunTransfers(cfg, transfers, results)) { - for (auto const& err : results.errResults) - Utils::Print("%s\n", err.errMsg.c_str()); - return 1; - } - if (showDetails) { - Utils::PrintResults(ev, 1, transfers, results); - Utils::Print("\n"); - } + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + std::vector transfers; + + for (int i = 0; i < groupSize; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; - if (Utils::RankDoesOutput()) { - Utils::Print("\n--- Pod Ring Group %d ---\n", group); - - int const numHops = groupSize; - int const numRows = 2 + numHops + 3; - int const numCols = 6; - int const precision = 2; - Utils::TableHelper table(numRows, numCols, precision); - - table.DrawRowBorder(0); - table.DrawColBorder(0); - table.DrawColBorder(numCols); - table.DrawRowBorder(numRows); - - table.Set(0, 0, " Src "); - table.Set(0, 1, " Src "); - table.Set(0, 2, " Dst "); - table.Set(0, 3, " Dst "); - table.Set(0, 4, " GFX BW "); - table.Set(1, 0, " Rank "); - table.Set(1, 1, " GPU "); - table.Set(1, 2, " Rank "); - table.Set(1, 3, " GPU "); - table.Set(1, 4, " (GB/s) "); - table.DrawColBorder(2); - table.DrawColBorder(4); - - if (numQueuePairs > 0) { - table.Set(0, 5, " NIC BW "); - table.Set(1, 5, " (GB/s) "); - } else { - table.Set(0, 5, " "); - table.Set(1, 5, " "); + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(devices[srcIdx]); + transfer.dsts.push_back(devices[dstIdx]); + transfer.exeDevice = {exeType, + (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), + (int32_t)(useRemoteRead ? devices[dstIdx].memRank : devices[srcIdx].memRank)}; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numSubExecs; + transfers.push_back(transfer); + + if (numQueuePairs > 0) { + TransferBench::Transfer nicTransfer; + nicTransfer.numBytes = numBytesPerTransfer; + nicTransfer.srcs.push_back(devices[srcIdx]); + nicTransfer.dsts.push_back(devices[dstIdx]); + nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, + (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; + nicTransfer.exeSubIndex = devices[dstIdx].memIndex; + nicTransfer.numSubExecs = numQueuePairs; + transfers.push_back(nicTransfer); + } } - table.DrawRowBorder(2); + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + Utils::PrintResults(ev, 1, transfers, results); + Utils::Print("\n"); + } - double gfxMin = std::numeric_limits::max(); - double gfxAvg = 0.0; - double gfxMax = std::numeric_limits::lowest(); - double nicMin = std::numeric_limits::max(); - double nicAvg = 0.0; - double nicMax = std::numeric_limits::lowest(); + if (Utils::RankDoesOutput()) { + Utils::Print("\n--- Pod %ld Ring Group %d ---\n", pod, group); + + int const numHops = groupSize; + int const numRows = 2 + numHops + 3; + int const numCols = 6; + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + table.Set(0, 0, " Src "); + table.Set(0, 1, " Src "); + table.Set(0, 2, " Dst "); + table.Set(0, 3, " Dst "); + table.Set(0, 4, " GFX BW "); + table.Set(1, 0, " Rank "); + table.Set(1, 1, " GPU "); + table.Set(1, 2, " Rank "); + table.Set(1, 3, " GPU "); + table.Set(1, 4, " (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); - int tfrIdx = 0; - for (int i = 0; i < numHops; i++) { - int srcIdx = groupBase + i; - int dstIdx = groupBase + (i + 1) % groupSize; - int row = 2 + i; + if (numQueuePairs > 0) { + table.Set(0, 5, " NIC BW "); + table.Set(1, 5, " (GB/s) "); + } else { + table.Set(0, 5, " "); + table.Set(1, 5, " "); + } - double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; - tfrIdx++; + table.DrawRowBorder(2); - table.Set(row, 0, " %d ", devices[srcIdx].memRank); - table.Set(row, 1, " %d ", devices[srcIdx].memIndex); - table.Set(row, 2, " %d ", devices[dstIdx].memRank); - table.Set(row, 3, " %d ", devices[dstIdx].memIndex); - table.Set(row, 4, " %.2f ", gfxBw); + double gfxMin = std::numeric_limits::max(); + double gfxAvg = 0.0; + double gfxMax = std::numeric_limits::lowest(); + double nicMin = std::numeric_limits::max(); + double nicAvg = 0.0; + double nicMax = std::numeric_limits::lowest(); - gfxMin = std::min(gfxMin, gfxBw); - gfxAvg += gfxBw; - gfxMax = std::max(gfxMax, gfxBw); + int tfrIdx = 0; + for (int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; - if (numQueuePairs > 0) { - double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; tfrIdx++; - table.Set(row, 5, " %.2f ", nicBw); - nicMin = std::min(nicMin, nicBw); - nicAvg += nicBw; - nicMax = std::max(nicMax, nicBw); + + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); + + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); + + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); + } } - } - int summaryBase = 2 + numHops; - table.DrawRowBorder(summaryBase); - table.Set(summaryBase , 1, " MAX "); - table.Set(summaryBase + 1, 1, " AVG "); - table.Set(summaryBase + 2, 1, " MIN "); - table.Set(summaryBase , 4, " %.2f ", gfxMax); - table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); - table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); - - if (numQueuePairs > 0) { - table.Set(summaryBase , 5, " %.2f ", nicMax); - table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); - table.Set(summaryBase + 2, 5, " %.2f ", nicMin); - } + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); - table.PrintTable(ev.outputToCsv, ev.showBorders); + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); - Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + } } } diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index 09e25b2..5505681 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -78,7 +78,7 @@ std::map presetFuncMap = {"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, {"poda2a", {PodAllToAllPreset, "All-to-all transfers between subgroups of ranks within a pod"}}, {"podp2p", {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}}, - {"podring", {PodRingPreset, "Ring transfers within subgroups of ranks in a pod"}}, + {"podring", {PodRingPreset, "Ring transfers within subgroups of ranks in a pod"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 017ca17..497770f 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -831,7 +831,7 @@ namespace TransferBench::Utils if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; } - // Ignore dummy round, its partner sits out this ronud + // Ignore dummy round, its partner sits out this round if (item1 < N && item2 < N) { roundPairs.push_back({item1, item2}); roundPairsReversed.push_back({item2, item1}); @@ -871,7 +871,7 @@ namespace TransferBench::Utils if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round { n = 1; - Print("[WARN] cannot create round robin schedule, falling back to serial"); + Print("[WARN] cannot create combination schedule, falling back to serial\n"); } // Generate rounds of combination based on incrementing distance From 86e6eff4ad7dfca63abc484c0a144281bff48be5 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 28 Apr 2026 18:57:36 +0000 Subject: [PATCH 3/5] adjusting sizing checks --- src/client/Presets/PodRing.hpp | 59 ++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp index 8e0b6f3..82d5f82 100644 --- a/src/client/Presets/PodRing.hpp +++ b/src/client/Presets/PodRing.hpp @@ -30,6 +30,12 @@ int PodRingPreset(EnvVars& ev, int numRanks = TransferBench::GetNumRanks(); int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); + if (rankToPod.empty()) { + Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); @@ -40,15 +46,32 @@ int PodRingPreset(EnvVars& ev, int stride = EnvVars::GetEnvVar("STRIDE" , 1); int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numGpus); + if (numGpus <= 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); + return 1; + } + if (groupSize < 2) { + Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n"); + return 1; + } + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); bool nicDifference = false; - for (int rank = 0; rank < numRanks; rank++) { - if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { - Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n"); + for (auto const& [pod, ranks] : rankToPod) { + int const podDevices = ranks.size() * numGpus; + if (podDevices % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d devices in pod %ld (%zu ranks x %d GPUs).\n", + groupSize, podDevices, pod, ranks.size(), numGpus); return 1; } - if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) - nicDifference = true; + for (int rank : ranks) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] Pod %ld rank %d has fewer than %d GPUs\n", pod, rank, numGpus); + return 1; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } } if (nicDifference) Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); @@ -67,40 +90,20 @@ int PodRingPreset(EnvVars& ev, ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); - ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); + ev.Print("GROUP_SIZE" , groupSize , "Dividing each pod's devices into ring groups of %d", groupSize); printf("\n"); } } - if (numGpus <= 0 || numGpus > numDetectedGpus) { - Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); - return 1; - } - if (groupSize < 2) { - Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n"); - return 1; - } - if (numRanks * numDetectedGpus % groupSize) { - Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", - groupSize, numRanks * numDetectedGpus, numRanks); - return 1; - } - Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); Utils::Print("==============================\n"); - Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d] [#Pods:%zu]\n", numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, - devMemTypeStr.c_str(), numQueuePairs, numRanks); + devMemTypeStr.c_str(), numQueuePairs, numRanks, rankToPod.size()); TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX; - Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); - if (rankToPod.empty()) { - Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); - return 1; - } - for (auto const& [pod, ranks] : rankToPod) { int n = ranks.size() * numGpus; int numGroups = n / groupSize; From 92cc2657e76fda7d308b1fa6c34221643956a994 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 28 Apr 2026 19:52:31 +0000 Subject: [PATCH 4/5] rolling back to single pod --- src/client/Presets/PodRing.hpp | 329 +++++++++++++++++---------------- 1 file changed, 165 insertions(+), 164 deletions(-) diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp index 82d5f82..5b449e5 100644 --- a/src/client/Presets/PodRing.hpp +++ b/src/client/Presets/PodRing.hpp @@ -25,17 +25,23 @@ int PodRingPreset(EnvVars& ev, std::string const presetName, bool const bytesSpecified) { + // Assuming single pod, for now + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] PodRing preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + if (Utils::GetRankPerPodMap().empty()) { + Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } + ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); int numRanks = TransferBench::GetNumRanks(); int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); - Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap(); - if (rankToPod.empty()) { - Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); - return 1; - } - int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); @@ -54,24 +60,21 @@ int PodRingPreset(EnvVars& ev, Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n"); return 1; } + if (numRanks * numGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, numRanks * numGpus, numRanks); + return 1; + } int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); bool nicDifference = false; - for (auto const& [pod, ranks] : rankToPod) { - int const podDevices = ranks.size() * numGpus; - if (podDevices % groupSize) { - Utils::Print("[ERROR] Group size %d cannot evenly divide %d devices in pod %ld (%zu ranks x %d GPUs).\n", - groupSize, podDevices, pod, ranks.size(), numGpus); + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n"); return 1; } - for (int rank : ranks) { - if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { - Utils::Print("[ERROR] Pod %ld rank %d has fewer than %d GPUs\n", pod, rank, numGpus); - return 1; - } - if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) - nicDifference = true; - } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; } if (nicDifference) Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); @@ -90,182 +93,180 @@ int PodRingPreset(EnvVars& ev, ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); - ev.Print("GROUP_SIZE" , groupSize , "Dividing each pod's devices into ring groups of %d", groupSize); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); printf("\n"); } } Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); Utils::Print("==============================\n"); - Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d] [#Pods:%zu]\n", + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, - devMemTypeStr.c_str(), numQueuePairs, numRanks, rankToPod.size()); + devMemTypeStr.c_str(), numQueuePairs, numRanks); TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX; - for (auto const& [pod, ranks] : rankToPod) { - int n = ranks.size() * numGpus; - int numGroups = n / groupSize; + int n = numRanks * numGpus; + int numGroups = n / groupSize; - std::vector indices(n); - for (int k = 0; k < n; k++) indices[k] = k; - Utils::StrideGenerate(indices, stride); + std::vector indices(n); + for (int k = 0; k < n; k++) indices[k] = k; + Utils::StrideGenerate(indices, stride); - std::vector devices(n); - for (int i = 0; i < n; i++) { - int const globalIdx = indices[i]; - int const rank = ranks[globalIdx / numGpus]; - int const devIdx = globalIdx % numGpus; - devices[i] = {memType, devIdx, rank}; - } + std::vector devices(n); + for (int i = 0; i < n; i++) { + int const globalIdx = indices[i]; + int const rank = globalIdx / numGpus; + int const devIdx = globalIdx % numGpus; + devices[i] = {memType, devIdx, rank}; + } - Utils::Print("Pod %ld: %d ring(s) of %d devices:\n", pod, numGroups, groupSize); - for (int group = 0; group < numGroups; group++) { - int const groupBase = group * groupSize; - Utils::Print(" Ring %d: ", group); - for (int i = 0; i < groupSize; i++) { - Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); + Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize); + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + Utils::Print(" Ring %d: ", group); + for (int i = 0; i < groupSize; i++) { + Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); + } + Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); + } + Utils::Print("\n"); + + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + std::vector transfers; + + for (int i = 0; i < groupSize; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(devices[srcIdx]); + transfer.dsts.push_back(devices[dstIdx]); + transfer.exeDevice = {exeType, + (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), + (int32_t)(useRemoteRead ? devices[dstIdx].memRank : devices[srcIdx].memRank)}; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numSubExecs; + transfers.push_back(transfer); + + if (numQueuePairs > 0) { + TransferBench::Transfer nicTransfer; + nicTransfer.numBytes = numBytesPerTransfer; + nicTransfer.srcs.push_back(devices[srcIdx]); + nicTransfer.dsts.push_back(devices[dstIdx]); + nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, + (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; + nicTransfer.exeSubIndex = devices[dstIdx].memIndex; + nicTransfer.numSubExecs = numQueuePairs; + transfers.push_back(nicTransfer); } - Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); } - Utils::Print("\n"); - - for (int group = 0; group < numGroups; group++) { - int const groupBase = group * groupSize; - std::vector transfers; - for (int i = 0; i < groupSize; i++) { - int srcIdx = groupBase + i; - int dstIdx = groupBase + (i + 1) % groupSize; - - TransferBench::Transfer transfer; - transfer.numBytes = numBytesPerTransfer; - transfer.srcs.push_back(devices[srcIdx]); - transfer.dsts.push_back(devices[dstIdx]); - transfer.exeDevice = {exeType, - (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), - (int32_t)(useRemoteRead ? devices[dstIdx].memRank : devices[srcIdx].memRank)}; - transfer.exeSubIndex = -1; - transfer.numSubExecs = numSubExecs; - transfers.push_back(transfer); + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + Utils::PrintResults(ev, 1, transfers, results); + Utils::Print("\n"); + } - if (numQueuePairs > 0) { - TransferBench::Transfer nicTransfer; - nicTransfer.numBytes = numBytesPerTransfer; - nicTransfer.srcs.push_back(devices[srcIdx]); - nicTransfer.dsts.push_back(devices[dstIdx]); - nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, - (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; - nicTransfer.exeSubIndex = devices[dstIdx].memIndex; - nicTransfer.numSubExecs = numQueuePairs; - transfers.push_back(nicTransfer); - } + if (Utils::RankDoesOutput()) { + Utils::Print("\n--- Pod Ring Group %d ---\n", group); + + int const numHops = groupSize; + int const numRows = 2 + numHops + 3; + int const numCols = 6; + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + table.Set(0, 0, " Src "); + table.Set(0, 1, " Src "); + table.Set(0, 2, " Dst "); + table.Set(0, 3, " Dst "); + table.Set(0, 4, " GFX BW "); + table.Set(1, 0, " Rank "); + table.Set(1, 1, " GPU "); + table.Set(1, 2, " Rank "); + table.Set(1, 3, " GPU "); + table.Set(1, 4, " (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); + + if (numQueuePairs > 0) { + table.Set(0, 5, " NIC BW "); + table.Set(1, 5, " (GB/s) "); + } else { + table.Set(0, 5, " "); + table.Set(1, 5, " "); } - TransferBench::TestResults results; - if (!TransferBench::RunTransfers(cfg, transfers, results)) { - for (auto const& err : results.errResults) - Utils::Print("%s\n", err.errMsg.c_str()); - return 1; - } - if (showDetails) { - Utils::PrintResults(ev, 1, transfers, results); - Utils::Print("\n"); - } + table.DrawRowBorder(2); - if (Utils::RankDoesOutput()) { - Utils::Print("\n--- Pod %ld Ring Group %d ---\n", pod, group); - - int const numHops = groupSize; - int const numRows = 2 + numHops + 3; - int const numCols = 6; - int const precision = 2; - Utils::TableHelper table(numRows, numCols, precision); - - table.DrawRowBorder(0); - table.DrawColBorder(0); - table.DrawColBorder(numCols); - table.DrawRowBorder(numRows); - - table.Set(0, 0, " Src "); - table.Set(0, 1, " Src "); - table.Set(0, 2, " Dst "); - table.Set(0, 3, " Dst "); - table.Set(0, 4, " GFX BW "); - table.Set(1, 0, " Rank "); - table.Set(1, 1, " GPU "); - table.Set(1, 2, " Rank "); - table.Set(1, 3, " GPU "); - table.Set(1, 4, " (GB/s) "); - table.DrawColBorder(2); - table.DrawColBorder(4); + double gfxMin = std::numeric_limits::max(); + double gfxAvg = 0.0; + double gfxMax = std::numeric_limits::lowest(); + double nicMin = std::numeric_limits::max(); + double nicAvg = 0.0; + double nicMax = std::numeric_limits::lowest(); - if (numQueuePairs > 0) { - table.Set(0, 5, " NIC BW "); - table.Set(1, 5, " (GB/s) "); - } else { - table.Set(0, 5, " "); - table.Set(1, 5, " "); - } + int tfrIdx = 0; + for (int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; - table.DrawRowBorder(2); + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; - double gfxMin = std::numeric_limits::max(); - double gfxAvg = 0.0; - double gfxMax = std::numeric_limits::lowest(); - double nicMin = std::numeric_limits::max(); - double nicAvg = 0.0; - double nicMax = std::numeric_limits::lowest(); + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); - int tfrIdx = 0; - for (int i = 0; i < numHops; i++) { - int srcIdx = groupBase + i; - int dstIdx = groupBase + (i + 1) % groupSize; - int row = 2 + i; + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); - double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; tfrIdx++; - - table.Set(row, 0, " %d ", devices[srcIdx].memRank); - table.Set(row, 1, " %d ", devices[srcIdx].memIndex); - table.Set(row, 2, " %d ", devices[dstIdx].memRank); - table.Set(row, 3, " %d ", devices[dstIdx].memIndex); - table.Set(row, 4, " %.2f ", gfxBw); - - gfxMin = std::min(gfxMin, gfxBw); - gfxAvg += gfxBw; - gfxMax = std::max(gfxMax, gfxBw); - - if (numQueuePairs > 0) { - double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; - tfrIdx++; - table.Set(row, 5, " %.2f ", nicBw); - nicMin = std::min(nicMin, nicBw); - nicAvg += nicBw; - nicMax = std::max(nicMax, nicBw); - } + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); } + } - int summaryBase = 2 + numHops; - table.DrawRowBorder(summaryBase); - table.Set(summaryBase , 1, " MAX "); - table.Set(summaryBase + 1, 1, " AVG "); - table.Set(summaryBase + 2, 1, " MIN "); - table.Set(summaryBase , 4, " %.2f ", gfxMax); - table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); - table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); - - if (numQueuePairs > 0) { - table.Set(summaryBase , 5, " %.2f ", nicMax); - table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); - table.Set(summaryBase + 2, 5, " %.2f ", nicMin); - } + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); + + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } - table.PrintTable(ev.outputToCsv, ev.showBorders); + table.PrintTable(ev.outputToCsv, ev.showBorders); - Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); - } + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); } } From 56b8de6c97f2bb7a00ee0c9c3d6e91220a3c5e11 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Tue, 28 Apr 2026 21:50:20 +0000 Subject: [PATCH 5/5] trigger CI --- src/client/Presets/PodAllToAll.hpp | 144 ++++++++++++++++++++++------- 1 file changed, 110 insertions(+), 34 deletions(-) diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp index dc33f75..b449ca8 100644 --- a/src/client/Presets/PodAllToAll.hpp +++ b/src/client/Presets/PodAllToAll.hpp @@ -151,9 +151,17 @@ int PodAllToAllPreset(EnvVars& ev, } } + // Build transfers for every group, then run once per pod so all groups share the same + // timed iterations (traffic across groups is concurrent within RunTransfers). + std::vector podTransfers; + std::vector groupTransferBase(numGroups); + std::vector>> groupReIndexes(numGroups); + for (int group = 0; group < numGroups; group++) { - std::vector> groupReIndex(groupSize, std::vector(groupSize, -1)); - std::vector transfers; + groupTransferBase[group] = podTransfers.size(); + groupReIndexes[group].assign(groupSize, std::vector(groupSize, -1)); + std::vector>& groupReIndex = groupReIndexes[group]; + for (int i = group * groupSize; i < (group + 1) * groupSize; i++) { for (int j = group * groupSize; j < (group + 1) * groupSize; j++) { if (i == j) { @@ -171,8 +179,9 @@ int PodAllToAllPreset(EnvVars& ev, transfer.numSubExecs = numSubExecs; int const localI = i - group * groupSize; int const localJ = j - group * groupSize; - groupReIndex[localI][localJ] = (int)transfers.size(); - transfers.push_back(transfer); + groupReIndex[localI][localJ] = + (int)(podTransfers.size() - groupTransferBase[group]); + podTransfers.push_back(transfer); } if (numQueuePairs > 0) { @@ -185,19 +194,47 @@ int PodAllToAllPreset(EnvVars& ev, (int32_t)devices[i].memIndex, (int32_t)devices[i].memRank}; transfer.exeSubIndex = devices[next].memIndex; transfer.numSubExecs = numQueuePairs; - transfers.push_back(transfer); + podTransfers.push_back(transfer); } } - TransferBench::TestResults results; - if (!TransferBench::RunTransfers(cfg, transfers, results)) { - for (auto const& err : results.errResults) - Utils::Print("%s\n", err.errMsg.c_str()); - return 1; - } - if (showDetails) { - Utils::PrintResults(ev, 1, transfers, results); + } + + if (Utils::RankDoesOutput()) { + for (int g = 0; g < numGroups; g++) { + int const gb = g * groupSize; + Utils::Print("A2A group %d:", g); + std::vector ord(groupSize); + for (int i = 0; i < groupSize; i++) ord[i] = i; + std::sort(ord.begin(), ord.end(), [&](int a, int b) { + MemDevice const& da = devices[gb + a]; + MemDevice const& db = devices[gb + b]; + if (da.memRank != db.memRank) return da.memRank < db.memRank; + return da.memIndex < db.memIndex; + }); + for (size_t si = 0; si < ord.size(); si++) { + MemDevice const& d = devices[gb + ord[si]]; + Utils::Print("%s R%d:G%d", si ? "," : "", d.memRank, d.memIndex); + } Utils::Print("\n"); } + } + + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, podTransfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + if (Utils::RankDoesOutput()) + Utils::Print("\n--- Pod AllToAll (all %d groups concurrent) ---\n", numGroups); + Utils::PrintResults(ev, 1, podTransfers, results); + Utils::Print("\n"); + } + + for (int group = 0; group < numGroups; group++) { + std::vector> const& groupReIndex = groupReIndexes[group]; + size_t const tfrBase = groupTransferBase[group]; // Per-group bandwidth table std::vector> groupBw(groupSize, std::vector(groupSize, -1.0)); @@ -205,14 +242,42 @@ int PodAllToAllPreset(EnvVars& ev, for (int localJ = 0; localJ < groupSize; localJ++) { int const k = groupReIndex[localI][localJ]; if (k >= 0) - groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec; + groupBw[localI][localJ] = results.tfrResults[tfrBase + k].avgBandwidthGbPerSec; } } if (Utils::RankDoesOutput()) { Utils::Print("\n--- Pod AllToAll Group %d ---\n", group); int const groupBase = group * groupSize; + + // Display order: group devices by MPI rank, then GPU index (stride only affects execution order). + std::vector order(groupSize); + for (int i = 0; i < groupSize; i++) order[i] = i; + std::sort(order.begin(), order.end(), [&](int a, int b) { + MemDevice const& da = devices[groupBase + a]; + MemDevice const& db = devices[groupBase + b]; + if (da.memRank != db.memRank) return da.memRank < db.memRank; + return da.memIndex < db.memIndex; + }); + std::vector colRanks; + for (int slot : order) { + int const r = devices[groupBase + slot].memRank; + if (colRanks.empty() || colRanks.back() != r) colRanks.push_back(r); + } + std::vector> localsPerCol; + localsPerCol.reserve(colRanks.size()); + for (int dr : colRanks) { + std::vector loc; + for (int li = 0; li < groupSize; li++) { + if (devices[groupBase + li].memRank == dr) loc.push_back(li); + } + std::sort(loc.begin(), loc.end(), [&](int a, int b) { + return devices[groupBase + a].memIndex < devices[groupBase + b].memIndex; + }); + localsPerCol.push_back(std::move(loc)); + } + int const numRows = 2 + groupSize; - int const numCols = 2 + groupSize; + int const numCols = 2 + (int)colRanks.size(); int const precision = 2; Utils::TableHelper table(numRows, numCols, precision); table.DrawRowBorder(0); @@ -224,35 +289,46 @@ int PodAllToAllPreset(EnvVars& ev, table.DrawColBorder(1); table.Set(1, 1, " Mem Device "); - // Column headers - int colPrevRank = -1; - for (int j = 0; j < groupSize; j++) { - int colIdx = 2 + j; - int r = devices[groupBase + j].memRank; - if (r != colPrevRank) { - table.DrawColBorder(colIdx); - table.Set(0, colIdx, " Rank %02d ", r); - colPrevRank = r; + for (size_t c = 0; c < colRanks.size(); c++) { + int const colIdx = 2 + (int)c; + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", colRanks[c]); + std::string gpuHdr; + for (int li : localsPerCol[c]) { + char t[24]; + snprintf(t, sizeof(t), " GPU %02d ", devices[groupBase + li].memIndex); + gpuHdr += t; } - table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex); + table.Set(1, colIdx, "%s", gpuHdr.c_str()); + table.SetColAlignment((int)c + 2, Utils::TableHelper::ALIGN_LEFT); } - // Row headers and data int rowPrevRank = -1; - for (int localI = 0; localI < groupSize; localI++) { - int rowIdx = 2 + localI; - int r = devices[groupBase + localI].memRank; + for (int disp = 0; disp < groupSize; disp++) { + int const localI = order[disp]; + int const rowIdx = 2 + disp; + int const r = devices[groupBase + localI].memRank; if (r != rowPrevRank) { table.DrawRowBorder(rowIdx); table.Set(rowIdx, 0, " Rank %02d ", r); rowPrevRank = r; + } else { + table.Set(rowIdx, 0, " "); } table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex); - for (int localJ = 0; localJ < groupSize; localJ++) { - if (groupBw[localI][localJ] >= 0) - table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]); - else - table.Set(rowIdx, 2 + localJ, " N/A "); + for (size_t c = 0; c < colRanks.size(); c++) { + std::string cell; + for (int localJ : localsPerCol[c]) { + char t[16]; + if (groupBw[localI][localJ] >= 0) + snprintf(t, sizeof(t), " %7.2f", groupBw[localI][localJ]); + else + snprintf(t, sizeof(t), " %7s", "N/A"); + cell += t; + } + int const colIdx = 2 + (int)c; + table.Set(rowIdx, colIdx, "%s", cell.c_str()); + table.SetCellAlignment(rowIdx, colIdx, Utils::TableHelper::ALIGN_LEFT); } } table.PrintTable(ev.outputToCsv, ev.showBorders);