From 963d581156820f0c07b4e1b05c7ff2041ffa7a81 Mon Sep 17 00:00:00 2001
From: AtlantaPepsi <timhu102@amd.com>
Date: Tue, 28 Apr 2026 16:10:39 +0000
Subject: [PATCH 1/5] adjusting grouping logic; lifting helper functions to
 Utilities

---
 src/client/Presets/NicPeerToPeer.hpp |  98 +---------
 src/client/Presets/PodAllToAll.hpp   |  23 +--
 src/client/Presets/PodPeerToPeer.hpp |   2 +-
 src/client/Presets/PodRing.hpp       | 267 +++++++++++++++++++++++++++
 src/client/Presets/Presets.hpp       |   2 +
 src/client/Utilities.hpp             | 127 +++++++++++++
 6 files changed, 400 insertions(+), 119 deletions(-)
 create mode 100644 src/client/Presets/PodRing.hpp

diff --git a/src/client/Presets/NicPeerToPeer.hpp b/src/client/Presets/NicPeerToPeer.hpp
index f0e0def..8c54baa 100644
--- a/src/client/Presets/NicPeerToPeer.hpp
+++ b/src/client/Presets/NicPeerToPeer.hpp
@@ -22,100 +22,6 @@ THE SOFTWARE.
 
 // Helper functions
 
-// Returns a schedule of round robin pairing of N elements, using Circle Method
-// if parallel, each round contains N/2 pairs, otherwise serial
-void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
-                        int N, int parallel = 0) {
-  if (N == 1) {
-    schedule.push_back({{0,0}});
-    return;
-  }
-  // Generate standard round-robin tournament (maximum parallelism)
-  std::vector<std::vector<std::pair<int, int>>> fullSchedule;
-
-  // Pad odd number of ranks with a dummy round (N+1)
-  int paddedN = N + N%2;
-  // Round-robin tournament scheduling
-  for (int round = 0; round < paddedN - 1; round++) {
-    std::vector<std::pair<int, int>> roundPairs;
-    std::vector<std::pair<int, int>> roundPairsReversed;
-    for (int i = 0; i < paddedN / 2; i++) {
-      int item1 = i;
-      int item2 = paddedN - 1 - i;
-      if (round > 0) {
-        // Rotate all except the first item
-        if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1;
-        if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1;
-      }
-      // Ignore dummy round, its partner sits out this ronud
-      if (item1 < N && item2 < N){
-        roundPairs.push_back({item1, item2});
-        roundPairsReversed.push_back({item2, item1});
-      }
-    }
-    fullSchedule.push_back(roundPairs);
-    fullSchedule.push_back(roundPairsReversed);
-  }
-
-  // A loopback round where all run in parallel
-  std::vector<std::pair<int, int>> selfRound;
-  for (int i = 0; i < N; i++) {
-    selfRound.push_back({i, i});
-  }
-  fullSchedule.push_back(selfRound);
-
-  if (parallel) {
-    schedule = std::move(fullSchedule);
-  } else {
-    // Serialize each round if needed
-    for (auto const& fullRound : fullSchedule) {
-      for (auto const& match : fullRound) {
-        std::vector<std::pair<int, int>> subRound;
-        subRound.push_back({match.first, match.second});
-        schedule.push_back(subRound);
-      }
-    }
-  }
-}
-
-// Returns a schedule for ordered 2-combination of N elements
-// by pairing the list with its rotating self,
-// each round contains n pairs, where 1 <= n <= N and N is divisible by n
-// and an element cannot appear more than twice in a round,
-void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
-                           int N, int n = 0) {
-  std::vector<std::vector<std::pair<int, int>>> fullSchedule;
-
-  if (n <= 0) n = N;
-  if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
-  {
-    n = 1;
-    Utils::Print("[WARN] cannot create round robin schedule, falling back to serial");
-  }
-
-  // Generate rounds of combination based on incrementing distance
-  for (int i = 0; i < N; i++) {
-    std::vector<std::pair<int, int>> round;
-    for (int j = 0; j < N; j++) {
-      round.push_back({j, (j+i)%N});
-    }
-    fullSchedule.push_back(round);
-  }
-
-  // Step 2: Split each full round into sub-rounds with at most n pairs
-  for (auto const& fullRound : fullSchedule) {
-    for (size_t start = 0; start < fullRound.size(); start += n) {
-      std::vector<std::pair<int, int>> subRound;
-      for (size_t i = start; i < start + n && i < fullRound.size(); i++) {
-        subRound.push_back(fullRound[i]);
-      }
-      if (!subRound.empty()) {
-        schedule.push_back(subRound);
-      }
-    }
-  }
-}
-
 int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) {
   return TransferBench::IsCpuMemType(memType) ?
          TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) :
@@ -203,8 +109,8 @@ int NicPeerToPeerPreset(EnvVars&          ev,
   std::vector<std::vector<std::pair<int, int>>> schedule;
   std::vector<std::vector<std::pair<int, int>>> nicSchedule;
 
-  RoundRobinSchedule(schedule, numRanks, nodeParallel);
-  CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel);
+  Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel);
+  Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel);
 
   int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank;
   int counter = 0;
diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp
index e03d388..dc33f75 100644
--- a/src/client/Presets/PodAllToAll.hpp
+++ b/src/client/Presets/PodAllToAll.hpp
@@ -20,27 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-// Reorder elements of list by stepping through with stride k, wrapping around.
-// When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are
-// concatenated, so every element appears exactly once in the output.
-// The reordered list will be further separated into different groups.
-void StrideGenerate(std::vector<int>& list, int k) {
-  int n = list.size();
-  k = ((k % n) + n) % n;  // normalize to 0..n-1
-  if (k == 0) return;
-
-  int d = std::gcd(k, n);
-  std::vector<int> out;
-  out.reserve(n);
-
-  for (int s = 0; s < d; s++) {
-    for (int j = 0; j < n / d; j++) {
-      out.push_back(list[(s + j * k) % n]);
-    }
-  }
-  list = std::move(out);
-}
-
 int PodAllToAllPreset(EnvVars&          ev,
                       size_t      const numBytesPerTransfer,
                       std::string const presetName,
@@ -164,7 +143,7 @@ int PodAllToAllPreset(EnvVars&          ev,
     std::vector<MemDevice> devices(n);
     std::vector<int> indices(n);
     for (int k = 0; k < n; k++) indices[k] = k;
-    StrideGenerate(indices, stride);
+    Utils::StrideGenerate(indices, stride);
     int idx = 0;
     for (int rank : ranks) {
       for (int devIdx = 0; devIdx < numGpus; devIdx++) {
diff --git a/src/client/Presets/PodPeerToPeer.hpp b/src/client/Presets/PodPeerToPeer.hpp
index 2148bd4..9ea8ca7 100644
--- a/src/client/Presets/PodPeerToPeer.hpp
+++ b/src/client/Presets/PodPeerToPeer.hpp
@@ -126,7 +126,7 @@ int PodPeerToPeerPreset(EnvVars&          ev,
       } else {
         // parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair
         std::vector<std::vector<std::pair<int, int>>> nodePairSchedule;
-        RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1);
+        Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1);
 
         for (auto const& roundNodePairs : nodePairSchedule) {
           for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) {
diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp
new file mode 100644
index 0000000..c591923
--- /dev/null
+++ b/src/client/Presets/PodRing.hpp
@@ -0,0 +1,267 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+int PodRingPreset(EnvVars&           ev,
+                  size_t      const  numBytesPerTransfer,
+                  std::string const  presetName)
+{
+  ev.gfxUnroll       = EnvVars::GetEnvVar("GFX_UNROLL", 2);
+
+  int numRanks       = TransferBench::GetNumRanks();
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  int memTypeIdx    = EnvVars::GetEnvVar("MEM_TYPE"       , 0);
+  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
+  int numSubExecs   = EnvVars::GetEnvVar("NUM_SUB_EXEC"   , 8);
+  int showDetails   = EnvVars::GetEnvVar("SHOW_DETAILS"   , 0);
+  int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC"   , 0);
+  int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
+  int stride        = EnvVars::GetEnvVar("STRIDE"         , 1);
+  int groupSize     = EnvVars::GetEnvVar("GROUP_SIZE"     , numRanks * numGpus);  // default: a single ring over every used device
+
+  int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0);  // rank 0's NIC count is the reference for the comparison below
+  bool nicDifference = false;
+  for (int rank = 0; rank < numRanks; rank++) {
+    if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {  // this rank exposes fewer GPUs than requested
+      Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n");
+      return 1;
+    }
+    if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))  // only relevant when NIC transfers are requested
+      nicDifference = true;
+  }
+  if (nicDifference)
+    Utils::Print("[WARN] Not all ranks have the same number of NICs\n");
+
+  MemType memType = Utils::GetGpuMemType(memTypeIdx);
+  std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx);
+
+  if (Utils::RankDoesOutput()) {
+    ev.DisplayEnvVars();
+    if (!ev.hideEnv) {
+      if (!ev.outputToCsv) printf("[PodRing Related]\n");
+      ev.Print("MEM_TYPE"       , memTypeIdx   , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str());
+      ev.Print("NUM_GPU_DEVICES", numGpus      , "Using %d GPUs", numGpus);
+      ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
+      ev.Print("NUM_SUB_EXEC"   , numSubExecs  , "Using %d subexecutors/CUs per Transfer", numSubExecs);
+      ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
+      ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
+      ev.Print("STRIDE"         , stride       , "Reordering devices by taking %d steps", stride);
+      ev.Print("GROUP_SIZE"     , groupSize    , "Dividing all devices into ring groups of %d", groupSize);
+      printf("\n");
+    }
+  }
+
+  if (numGpus <= 0 || numGpus > numDetectedGpus) {
+    Utils::Print("[ERROR] Cannot use %d GPUs.  Detected %d GPUs\n", numGpus, numDetectedGpus);
+    return 1;
+  }
+  if (groupSize < 2) {  // also guarantees the modulo below never divides by zero
+    Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n");
+    return 1;
+  }
+  if ((numRanks * numGpus) % groupSize) {  // rings are carved out of the numRanks * numGpus devices actually used, not every detected GPU
+    Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n",
+                 groupSize, numRanks * numGpus, numRanks);
+    return 1;
+  }
+
+  Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX");
+  Utils::Print("==============================\n");
+  Utils::Print("[%zu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n",
+               numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs,
+               devMemTypeStr.c_str(), numQueuePairs, numRanks);
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
+
+  int n = numRanks * numGpus;
+  int numGroups = n / groupSize;
+
+  std::vector<int> indices(n);
+  for (int k = 0; k < n; k++) indices[k] = k;
+  Utils::StrideGenerate(indices, stride);
+
+  std::vector<MemDevice> devices(n);
+  for (int i = 0; i < n; i++) {
+    int const globalIdx = indices[i];
+    int const rank      = globalIdx / numGpus;
+    int const devIdx    = globalIdx % numGpus;
+    devices[i] = {memType, devIdx, rank};
+  }
+
+  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    Utils::Print("  Ring %d: ", group);
+    for (int i = 0; i < groupSize; i++) {
+      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
+    }
+    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
+  }
+  Utils::Print("\n");
+
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    std::vector<Transfer> transfers;
+
+    for (int i = 0; i < groupSize; i++) {
+      int srcIdx = groupBase + i;
+      int dstIdx = groupBase + (i + 1) % groupSize;
+
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      transfer.srcs.push_back(devices[srcIdx]);
+      transfer.dsts.push_back(devices[dstIdx]);
+      transfer.exeDevice = {exeType,
+                           (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
+                           (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
+      transfer.exeSubIndex = -1;
+      transfer.numSubExecs = numSubExecs;
+      transfers.push_back(transfer);
+
+      if (numQueuePairs > 0) {
+        TransferBench::Transfer nicTransfer;
+        nicTransfer.numBytes = numBytesPerTransfer;
+        nicTransfer.srcs.push_back(devices[srcIdx]);
+        nicTransfer.dsts.push_back(devices[dstIdx]);
+        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
+        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+        nicTransfer.numSubExecs = numQueuePairs;
+        transfers.push_back(nicTransfer);
+      }
+    }
+
+    TransferBench::TestResults results;
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return 1;
+    }
+    if (showDetails) {
+      Utils::PrintResults(ev, 1, transfers, results);
+      Utils::Print("\n");
+    }
+
+    if (Utils::RankDoesOutput()) {
+      Utils::Print("\n--- Pod Ring Group %d ---\n", group);
+
+      int const numHops   = groupSize;
+      int const numRows   = 2 + numHops + 3;
+      int const numCols   = 6;
+      int const precision = 2;
+      Utils::TableHelper table(numRows, numCols, precision);
+
+      table.DrawRowBorder(0);
+      table.DrawColBorder(0);
+      table.DrawColBorder(numCols);
+      table.DrawRowBorder(numRows);
+
+      table.Set(0, 0, " Src ");
+      table.Set(0, 1, " Src ");
+      table.Set(0, 2, " Dst ");
+      table.Set(0, 3, " Dst ");
+      table.Set(0, 4, " GFX BW ");
+      table.Set(1, 0, " Rank ");
+      table.Set(1, 1, " GPU ");
+      table.Set(1, 2, " Rank ");
+      table.Set(1, 3, " GPU ");
+      table.Set(1, 4, " (GB/s) ");
+      table.DrawColBorder(2);
+      table.DrawColBorder(4);
+
+      if (numQueuePairs > 0) {
+        table.Set(0, 5, " NIC BW ");
+        table.Set(1, 5, " (GB/s) ");
+      } else {
+        table.Set(0, 5, " ");
+        table.Set(1, 5, " ");
+      }
+
+      table.DrawRowBorder(2);
+
+      double gfxMin = std::numeric_limits<double>::max();
+      double gfxAvg = 0.0;
+      double gfxMax = std::numeric_limits<double>::lowest();
+      double nicMin = std::numeric_limits<double>::max();
+      double nicAvg = 0.0;
+      double nicMax = std::numeric_limits<double>::lowest();
+
+      int tfrIdx = 0;
+      for (int i = 0; i < numHops; i++) {
+        int srcIdx = groupBase + i;
+        int dstIdx = groupBase + (i + 1) % groupSize;
+        int row    = 2 + i;
+
+        double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
+        tfrIdx++;
+
+        table.Set(row, 0, " %d ", devices[srcIdx].memRank);
+        table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
+        table.Set(row, 2, " %d ", devices[dstIdx].memRank);
+        table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
+        table.Set(row, 4, " %.2f ", gfxBw);
+
+        gfxMin = std::min(gfxMin, gfxBw);
+        gfxAvg += gfxBw;
+        gfxMax = std::max(gfxMax, gfxBw);
+
+        if (numQueuePairs > 0) {
+          double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
+          tfrIdx++;
+          table.Set(row, 5, " %.2f ", nicBw);
+          nicMin = std::min(nicMin, nicBw);
+          nicAvg += nicBw;
+          nicMax = std::max(nicMax, nicBw);
+        }
+      }
+
+      int summaryBase = 2 + numHops;
+      table.DrawRowBorder(summaryBase);
+      table.Set(summaryBase    , 1, " MAX ");
+      table.Set(summaryBase + 1, 1, " AVG ");
+      table.Set(summaryBase + 2, 1, " MIN ");
+      table.Set(summaryBase    , 4, " %.2f ", gfxMax);
+      table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
+      table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
+
+      if (numQueuePairs > 0) {
+        table.Set(summaryBase    , 5, " %.2f ", nicMax);
+        table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
+        table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
+      }
+
+      table.PrintTable(ev.outputToCsv, ev.showBorders);
+
+      Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+    }
+  }
+
+  if (!Utils::RankDoesOutput()) return 0;  // the hostname warning below is emitted by the output rank only
+
+  if (Utils::HasDuplicateHostname()) {
+    printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
+  }
+
+  return 0;
+}
diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp
index 8354208..09e25b2 100644
--- a/src/client/Presets/Presets.hpp
+++ b/src/client/Presets/Presets.hpp
@@ -43,6 +43,7 @@ THE SOFTWARE.
 #include "PeerToPeer.hpp"
 #include "PodAllToAll.hpp"
 #include "PodPeerToPeer.hpp"
+#include "PodRing.hpp"
 #include "Scaling.hpp"
 #include "Schmoo.hpp"
 #include "SmokeTest.hpp"
@@ -77,6 +78,7 @@ std::map<std::string, PresetInfo> presetFuncMap =
   {"p2p"   ,      {PeerToPeerPreset,    "Peer-to-peer device memory bandwidth test"}},
   {"poda2a",      {PodAllToAllPreset,   "All-to-all transfers between subgroups of ranks within a pod"}},
   {"podp2p",      {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}},
+  {"podring",     {PodRingPreset,      "Ring transfers within subgroups of ranks in a pod"}},
   {"rsweep",      {SweepPreset,         "Randomly sweep through sets of Transfers"}},
   {"scaling",     {ScalingPreset,       "Run scaling test from one GPU to other devices"}},
   {"schmoo",      {SchmooPreset,        "Scaling tests for local/remote read/write/copy"}},
diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp
index 259e4cc..017ca17 100644
--- a/src/client/Utilities.hpp
+++ b/src/client/Utilities.hpp
@@ -155,6 +155,24 @@ namespace TransferBench::Utils
   bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr);
   bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes);
 
+  // Reorder elements of list by stepping through with stride k, wrapping around.
+  // When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are
+  // concatenated, so every element appears exactly once in the output.
+  // The reordered list will be further separated into different groups.
+  void StrideGenerate(std::vector<int>& list, int k);
+
+  // Returns a schedule of round robin pairing of N elements, using Circle Method.
+  // If parallel, each round contains N/2 pairs, otherwise serial.
+  void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                          int N, int parallel = 0);
+
+  // Returns a schedule for ordered 2-combination of N elements
+  // by pairing the list with its rotating self.
+  // Each round contains n pairs, where 1 <= n <= N and N is divisible by n,
+  // and an element cannot appear more than twice in a round.
+  void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                           int N, int n = 0);
+
   // Implementation details below
   //================================================================
   TableHelper::TableHelper(int numRows, int numCols, int precision) :
@@ -769,4 +787,113 @@ namespace TransferBench::Utils
   {
     return (TransferBench::DeallocateMemory(memType, memPtr, bytes).errType != TransferBench::ERR_NONE);
   }
+
+  void StrideGenerate(std::vector<int>& list, int k)
+  {
+    int n = list.size();
+    if (n == 0) return;  // nothing to reorder; also keeps the k % n below from dividing by zero
+    k = ((k % n) + n) % n;  // normalize to 0..n-1
+    if (k == 0) return;  // stride is a multiple of n: list is left in its current order
+
+    int d = std::gcd(k, n);  // number of disjoint orbits a stride of k splits the n positions into
+    std::vector<int> out;
+    out.reserve(n);
+
+    for (int s = 0; s < d; s++) {  // one orbit per start offset s
+      for (int j = 0; j < n / d; j++) {  // each orbit holds n / d elements
+        out.push_back(list[(s + j * k) % n]);
+      }
+    }
+    list = std::move(out);
+  }
+
+  void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                          int N, int parallel)
+  {
+    if (N == 1) {  // single element: the only possible pair is the loopback (0,0)
+      schedule.push_back({{0, 0}});
+      return;
+    }
+    // Generate standard round-robin tournament (maximum parallelism)
+    std::vector<std::vector<std::pair<int, int>>> fullSchedule;
+
+    // Pad an odd number of elements with one dummy element (index N)
+    int paddedN = N + N % 2;
+    // Round-robin tournament scheduling
+    for (int round = 0; round < paddedN - 1; round++) {
+      std::vector<std::pair<int, int>> roundPairs;
+      std::vector<std::pair<int, int>> roundPairsReversed;
+      for (int i = 0; i < paddedN / 2; i++) {
+        int item1 = i;
+        int item2 = paddedN - 1 - i;
+        if (round > 0) {
+          // Rotate all except the first item
+          if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1;
+          if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1;
+        }
+        // Skip pairs that involve the dummy element; its partner sits out this round
+        if (item1 < N && item2 < N) {
+          roundPairs.push_back({item1, item2});
+          roundPairsReversed.push_back({item2, item1});
+        }
+      }
+      fullSchedule.push_back(roundPairs);  // forward direction of every pair in this round
+      fullSchedule.push_back(roundPairsReversed);  // same pairs with source and destination swapped
+    }
+
+    // A loopback round where all run in parallel
+    std::vector<std::pair<int, int>> selfRound;
+    for (int i = 0; i < N; i++) {
+      selfRound.push_back({i, i});
+    }
+    fullSchedule.push_back(selfRound);
+
+    if (parallel) {
+      schedule = std::move(fullSchedule);  // note: assigns, replacing prior contents of schedule (the other paths append)
+    } else {
+      // Serialize each round if needed
+      for (auto const& fullRound : fullSchedule) {
+        for (auto const& match : fullRound) {
+          std::vector<std::pair<int, int>> subRound;
+          subRound.push_back({match.first, match.second});
+          schedule.push_back(subRound);
+        }
+      }
+    }
+  }
+
+  void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
+                           int N, int n)
+  {
+    std::vector<std::vector<std::pair<int, int>>> fullSchedule;
+
+    if (n <= 0) n = N;  // non-positive n selects one sub-round per full round
+    if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
+    {
+      n = 1;
+      Print("[WARN] cannot create round robin schedule, falling back to serial\n");
+    }
+
+    // Generate rounds of combination based on incrementing distance
+    for (int i = 0; i < N; i++) {  // i is the offset between the two members of each pair
+      std::vector<std::pair<int, int>> round;
+      for (int j = 0; j < N; j++) {
+        round.push_back({j, (j + i) % N});
+      }
+      fullSchedule.push_back(round);
+    }
+
+    // Step 2: Split each full round into sub-rounds with at most n pairs
+    for (auto const& fullRound : fullSchedule) {
+      for (size_t start = 0; start < fullRound.size(); start += n) {
+        std::vector<std::pair<int, int>> subRound;
+        for (size_t i = start; i < start + n && i < fullRound.size(); i++) {
+          subRound.push_back(fullRound[i]);
+        }
+        if (!subRound.empty()) {
+          schedule.push_back(subRound);  // appended after whatever schedule already holds
+        }
+      }
+    }
+  }
 };

From 6886db230a916d63831703b15e6a9196c4a123fe Mon Sep 17 00:00:00 2001
From: AtlantaPepsi <timhu102@amd.com>
Date: Tue, 28 Apr 2026 17:33:19 +0000
Subject: [PATCH 2/5] addition of pod loop and minor fixes

---
 src/client/Presets/PodRing.hpp | 293 +++++++++++++++++----------------
 src/client/Presets/Presets.hpp |   2 +-
 src/client/Utilities.hpp       |   4 +-
 3 files changed, 154 insertions(+), 145 deletions(-)

diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp
index c591923..8e0b6f3 100644
--- a/src/client/Presets/PodRing.hpp
+++ b/src/client/Presets/PodRing.hpp
@@ -20,9 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-int PodRingPreset(EnvVars&           ev,
-                  size_t      const  numBytesPerTransfer,
-                  std::string const  presetName)
+int PodRingPreset(EnvVars&          ev,
+                  size_t      const numBytesPerTransfer,
+                  std::string const presetName,
+                  bool        const bytesSpecified)
 {
   ev.gfxUnroll       = EnvVars::GetEnvVar("GFX_UNROLL", 2);
 
@@ -94,166 +95,174 @@ int PodRingPreset(EnvVars&           ev,
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
   ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
 
-  int n = numRanks * numGpus;
-  int numGroups = n / groupSize;
+  Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap();
+  if (rankToPod.empty()) {
+    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return 1;
+  }
 
-  std::vector<int> indices(n);
-  for (int k = 0; k < n; k++) indices[k] = k;
-  Utils::StrideGenerate(indices, stride);
+  for (auto const& [pod, ranks] : rankToPod) {
+    int n = ranks.size() * numGpus;
+    int numGroups = n / groupSize;
 
-  std::vector<MemDevice> devices(n);
-  for (int i = 0; i < n; i++) {
-    int const globalIdx = indices[i];
-    int const rank      = globalIdx / numGpus;
-    int const devIdx    = globalIdx % numGpus;
-    devices[i] = {memType, devIdx, rank};
-  }
+    std::vector<int> indices(n);
+    for (int k = 0; k < n; k++) indices[k] = k;
+    Utils::StrideGenerate(indices, stride);
 
-  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
-  for (int group = 0; group < numGroups; group++) {
-    int const groupBase = group * groupSize;
-    Utils::Print("  Ring %d: ", group);
-    for (int i = 0; i < groupSize; i++) {
-      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
+    std::vector<MemDevice> devices(n);
+    for (int i = 0; i < n; i++) {
+      int const globalIdx = indices[i];
+      int const rank      = ranks[globalIdx / numGpus];
+      int const devIdx    = globalIdx % numGpus;
+      devices[i] = {memType, devIdx, rank};
     }
-    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
-  }
-  Utils::Print("\n");
-
-  for (int group = 0; group < numGroups; group++) {
-    int const groupBase = group * groupSize;
-    std::vector<Transfer> transfers;
-
-    for (int i = 0; i < groupSize; i++) {
-      int srcIdx = groupBase + i;
-      int dstIdx = groupBase + (i + 1) % groupSize;
-
-      TransferBench::Transfer transfer;
-      transfer.numBytes = numBytesPerTransfer;
-      transfer.srcs.push_back(devices[srcIdx]);
-      transfer.dsts.push_back(devices[dstIdx]);
-      transfer.exeDevice = {exeType,
-                           (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
-                           (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
-      transfer.exeSubIndex = -1;
-      transfer.numSubExecs = numSubExecs;
-      transfers.push_back(transfer);
-
-      if (numQueuePairs > 0) {
-        TransferBench::Transfer nicTransfer;
-        nicTransfer.numBytes = numBytesPerTransfer;
-        nicTransfer.srcs.push_back(devices[srcIdx]);
-        nicTransfer.dsts.push_back(devices[dstIdx]);
-        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
-                                (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
-        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
-        nicTransfer.numSubExecs = numQueuePairs;
-        transfers.push_back(nicTransfer);
+
+    Utils::Print("Pod %ld: %d ring(s) of %d devices:\n", pod, numGroups, groupSize);
+    for (int group = 0; group < numGroups; group++) {
+      int const groupBase = group * groupSize;
+      Utils::Print("  Ring %d: ", group);
+      for (int i = 0; i < groupSize; i++) {
+        Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
       }
+      Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
     }
+    Utils::Print("\n");
 
-    TransferBench::TestResults results;
-    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-      for (auto const& err : results.errResults)
-        Utils::Print("%s\n", err.errMsg.c_str());
-      return 1;
-    }
-    if (showDetails) {
-      Utils::PrintResults(ev, 1, transfers, results);
-      Utils::Print("\n");
-    }
+    for (int group = 0; group < numGroups; group++) {
+      int const groupBase = group * groupSize;
+      std::vector<Transfer> transfers;
+
+      for (int i = 0; i < groupSize; i++) {
+        int srcIdx = groupBase + i;
+        int dstIdx = groupBase + (i + 1) % groupSize;
 
-    if (Utils::RankDoesOutput()) {
-      Utils::Print("\n--- Pod Ring Group %d ---\n", group);
-
-      int const numHops   = groupSize;
-      int const numRows   = 2 + numHops + 3;
-      int const numCols   = 6;
-      int const precision = 2;
-      Utils::TableHelper table(numRows, numCols, precision);
-
-      table.DrawRowBorder(0);
-      table.DrawColBorder(0);
-      table.DrawColBorder(numCols);
-      table.DrawRowBorder(numRows);
-
-      table.Set(0, 0, " Src ");
-      table.Set(0, 1, " Src ");
-      table.Set(0, 2, " Dst ");
-      table.Set(0, 3, " Dst ");
-      table.Set(0, 4, " GFX BW ");
-      table.Set(1, 0, " Rank ");
-      table.Set(1, 1, " GPU ");
-      table.Set(1, 2, " Rank ");
-      table.Set(1, 3, " GPU ");
-      table.Set(1, 4, " (GB/s) ");
-      table.DrawColBorder(2);
-      table.DrawColBorder(4);
-
-      if (numQueuePairs > 0) {
-        table.Set(0, 5, " NIC BW ");
-        table.Set(1, 5, " (GB/s) ");
-      } else {
-        table.Set(0, 5, " ");
-        table.Set(1, 5, " ");
+        TransferBench::Transfer transfer;
+        transfer.numBytes = numBytesPerTransfer;
+        transfer.srcs.push_back(devices[srcIdx]);
+        transfer.dsts.push_back(devices[dstIdx]);
+        transfer.exeDevice = {exeType,
+                             (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
+                             (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
+        transfer.exeSubIndex = -1;
+        transfer.numSubExecs = numSubExecs;
+        transfers.push_back(transfer);
+
+        if (numQueuePairs > 0) {
+          TransferBench::Transfer nicTransfer;
+          nicTransfer.numBytes = numBytesPerTransfer;
+          nicTransfer.srcs.push_back(devices[srcIdx]);
+          nicTransfer.dsts.push_back(devices[dstIdx]);
+          nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                  (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
+          nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+          nicTransfer.numSubExecs = numQueuePairs;
+          transfers.push_back(nicTransfer);
+        }
       }
 
-      table.DrawRowBorder(2);
+      TransferBench::TestResults results;
+      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+        for (auto const& err : results.errResults)
+          Utils::Print("%s\n", err.errMsg.c_str());
+        return 1;
+      }
+      if (showDetails) {
+        Utils::PrintResults(ev, 1, transfers, results);
+        Utils::Print("\n");
+      }
 
-      double gfxMin = std::numeric_limits<double>::max();
-      double gfxAvg = 0.0;
-      double gfxMax = std::numeric_limits<double>::lowest();
-      double nicMin = std::numeric_limits<double>::max();
-      double nicAvg = 0.0;
-      double nicMax = std::numeric_limits<double>::lowest();
+      if (Utils::RankDoesOutput()) {
+        Utils::Print("\n--- Pod %ld Ring Group %d ---\n", pod, group);
+
+        int const numHops   = groupSize;
+        int const numRows   = 2 + numHops + 3;
+        int const numCols   = 6;
+        int const precision = 2;
+        Utils::TableHelper table(numRows, numCols, precision);
+
+        table.DrawRowBorder(0);
+        table.DrawColBorder(0);
+        table.DrawColBorder(numCols);
+        table.DrawRowBorder(numRows);
+
+        table.Set(0, 0, " Src ");
+        table.Set(0, 1, " Src ");
+        table.Set(0, 2, " Dst ");
+        table.Set(0, 3, " Dst ");
+        table.Set(0, 4, " GFX BW ");
+        table.Set(1, 0, " Rank ");
+        table.Set(1, 1, " GPU ");
+        table.Set(1, 2, " Rank ");
+        table.Set(1, 3, " GPU ");
+        table.Set(1, 4, " (GB/s) ");
+        table.DrawColBorder(2);
+        table.DrawColBorder(4);
 
-      int tfrIdx = 0;
-      for (int i = 0; i < numHops; i++) {
-        int srcIdx = groupBase + i;
-        int dstIdx = groupBase + (i + 1) % groupSize;
-        int row    = 2 + i;
+        if (numQueuePairs > 0) {
+          table.Set(0, 5, " NIC BW ");
+          table.Set(1, 5, " (GB/s) ");
+        } else {
+          table.Set(0, 5, " ");
+          table.Set(1, 5, " ");
+        }
 
-        double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
-        tfrIdx++;
+        table.DrawRowBorder(2);
 
-        table.Set(row, 0, " %d ", devices[srcIdx].memRank);
-        table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
-        table.Set(row, 2, " %d ", devices[dstIdx].memRank);
-        table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
-        table.Set(row, 4, " %.2f ", gfxBw);
+        double gfxMin = std::numeric_limits<double>::max();
+        double gfxAvg = 0.0;
+        double gfxMax = std::numeric_limits<double>::lowest();
+        double nicMin = std::numeric_limits<double>::max();
+        double nicAvg = 0.0;
+        double nicMax = std::numeric_limits<double>::lowest();
 
-        gfxMin = std::min(gfxMin, gfxBw);
-        gfxAvg += gfxBw;
-        gfxMax = std::max(gfxMax, gfxBw);
+        int tfrIdx = 0;
+        for (int i = 0; i < numHops; i++) {
+          int srcIdx = groupBase + i;
+          int dstIdx = groupBase + (i + 1) % groupSize;
+          int row    = 2 + i;
 
-        if (numQueuePairs > 0) {
-          double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
+          double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
           tfrIdx++;
-          table.Set(row, 5, " %.2f ", nicBw);
-          nicMin = std::min(nicMin, nicBw);
-          nicAvg += nicBw;
-          nicMax = std::max(nicMax, nicBw);
+
+          table.Set(row, 0, " %d ", devices[srcIdx].memRank);
+          table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
+          table.Set(row, 2, " %d ", devices[dstIdx].memRank);
+          table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
+          table.Set(row, 4, " %.2f ", gfxBw);
+
+          gfxMin = std::min(gfxMin, gfxBw);
+          gfxAvg += gfxBw;
+          gfxMax = std::max(gfxMax, gfxBw);
+
+          if (numQueuePairs > 0) {
+            double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
+            tfrIdx++;
+            table.Set(row, 5, " %.2f ", nicBw);
+            nicMin = std::min(nicMin, nicBw);
+            nicAvg += nicBw;
+            nicMax = std::max(nicMax, nicBw);
+          }
         }
-      }
 
-      int summaryBase = 2 + numHops;
-      table.DrawRowBorder(summaryBase);
-      table.Set(summaryBase    , 1, " MAX ");
-      table.Set(summaryBase + 1, 1, " AVG ");
-      table.Set(summaryBase + 2, 1, " MIN ");
-      table.Set(summaryBase    , 4, " %.2f ", gfxMax);
-      table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
-      table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
-
-      if (numQueuePairs > 0) {
-        table.Set(summaryBase    , 5, " %.2f ", nicMax);
-        table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
-        table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
-      }
+        int summaryBase = 2 + numHops;
+        table.DrawRowBorder(summaryBase);
+        table.Set(summaryBase    , 1, " MAX ");
+        table.Set(summaryBase + 1, 1, " AVG ");
+        table.Set(summaryBase + 2, 1, " MIN ");
+        table.Set(summaryBase    , 4, " %.2f ", gfxMax);
+        table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
+        table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
 
-      table.PrintTable(ev.outputToCsv, ev.showBorders);
+        if (numQueuePairs > 0) {
+          table.Set(summaryBase    , 5, " %.2f ", nicMax);
+          table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
+          table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
+        }
+
+        table.PrintTable(ev.outputToCsv, ev.showBorders);
 
-      Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+        Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+      }
     }
   }
 
diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp
index 09e25b2..5505681 100644
--- a/src/client/Presets/Presets.hpp
+++ b/src/client/Presets/Presets.hpp
@@ -78,7 +78,7 @@ std::map<std::string, PresetInfo> presetFuncMap =
   {"p2p"   ,      {PeerToPeerPreset,    "Peer-to-peer device memory bandwidth test"}},
   {"poda2a",      {PodAllToAllPreset,   "All-to-all transfers between subgroups of ranks within a pod"}},
   {"podp2p",      {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}},
-  {"podring",     {PodRingPreset,      "Ring transfers within subgroups of ranks in a pod"}},
+  {"podring",     {PodRingPreset,       "Ring transfers within subgroups of ranks in a pod"}},
   {"rsweep",      {SweepPreset,         "Randomly sweep through sets of Transfers"}},
   {"scaling",     {ScalingPreset,       "Run scaling test from one GPU to other devices"}},
   {"schmoo",      {SchmooPreset,        "Scaling tests for local/remote read/write/copy"}},
diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp
index 017ca17..497770f 100644
--- a/src/client/Utilities.hpp
+++ b/src/client/Utilities.hpp
@@ -831,7 +831,7 @@ namespace TransferBench::Utils
           if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1;
           if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1;
         }
-        // Ignore dummy round, its partner sits out this ronud
+        // Ignore dummy round, its partner sits out this round
         if (item1 < N && item2 < N) {
           roundPairs.push_back({item1, item2});
           roundPairsReversed.push_back({item2, item1});
@@ -871,7 +871,7 @@ namespace TransferBench::Utils
     if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
     {
       n = 1;
-      Print("[WARN] cannot create round robin schedule, falling back to serial");
+      Print("[WARN] cannot create combination schedule, falling back to serial\n");
     }
 
     // Generate rounds of combination based on incrementing distance

From 86e6eff4ad7dfca63abc484c0a144281bff48be5 Mon Sep 17 00:00:00 2001
From: AtlantaPepsi <timhu102@amd.com>
Date: Tue, 28 Apr 2026 18:57:36 +0000
Subject: [PATCH 3/5] adjusting sizing checks

---
 src/client/Presets/PodRing.hpp | 59 ++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp
index 8e0b6f3..82d5f82 100644
--- a/src/client/Presets/PodRing.hpp
+++ b/src/client/Presets/PodRing.hpp
@@ -30,6 +30,12 @@ int PodRingPreset(EnvVars&          ev,
   int numRanks       = TransferBench::GetNumRanks();
   int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
 
+  Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap();
+  if (rankToPod.empty()) {
+    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return 1;
+  }
+
   int memTypeIdx    = EnvVars::GetEnvVar("MEM_TYPE"       , 0);
   int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
   int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
@@ -40,15 +46,32 @@ int PodRingPreset(EnvVars&          ev,
   int stride        = EnvVars::GetEnvVar("STRIDE"         , 1);
   int groupSize     = EnvVars::GetEnvVar("GROUP_SIZE"     , numRanks * numGpus);
 
+  if (numGpus <= 0 || numGpus > numDetectedGpus) {
+    Utils::Print("[ERROR] Cannot use %d GPUs.  Detected %d GPUs\n", numGpus, numDetectedGpus);
+    return 1;
+  }
+  if (groupSize < 2) {
+    Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n");
+    return 1;
+  }
+
   int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0);
   bool nicDifference = false;
-  for (int rank = 0; rank < numRanks; rank++) {
-    if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {
-      Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n");
+  for (auto const& [pod, ranks] : rankToPod) {
+    int const podDevices = ranks.size() * numGpus;
+    if (podDevices % groupSize) {
+      Utils::Print("[ERROR] Group size %d cannot evenly divide %d devices in pod %ld (%zu ranks x %d GPUs).\n",
+                   groupSize, podDevices, pod, ranks.size(), numGpus);
       return 1;
     }
-    if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))
-      nicDifference = true;
+    for (int rank : ranks) {
+      if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {
+        Utils::Print("[ERROR] Pod %ld rank %d has fewer than %d GPUs\n", pod, rank, numGpus);
+        return 1;
+      }
+      if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))
+        nicDifference = true;
+    }
   }
   if (nicDifference)
     Utils::Print("[WARN] Not all ranks have the same number of NICs\n");
@@ -67,40 +90,20 @@ int PodRingPreset(EnvVars&          ev,
       ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
       ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
       ev.Print("STRIDE"         , stride       , "Reordering devices by taking %d steps", stride);
-      ev.Print("GROUP_SIZE"     , groupSize    , "Dividing all devices into ring groups of %d", groupSize);
+      ev.Print("GROUP_SIZE"     , groupSize    , "Dividing each pod's devices into ring groups of %d", groupSize);
       printf("\n");
     }
   }
 
-  if (numGpus <= 0 || numGpus > numDetectedGpus) {
-    Utils::Print("[ERROR] Cannot use %d GPUs.  Detected %d GPUs\n", numGpus, numDetectedGpus);
-    return 1;
-  }
-  if (groupSize < 2) {
-    Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n");
-    return 1;
-  }
-  if (numRanks * numDetectedGpus % groupSize) {
-    Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n",
-                 groupSize, numRanks * numDetectedGpus, numRanks);
-    return 1;
-  }
-
   Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX");
   Utils::Print("==============================\n");
-  Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n",
+  Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d] [#Pods:%zu]\n",
                numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs,
-               devMemTypeStr.c_str(), numQueuePairs, numRanks);
+               devMemTypeStr.c_str(), numQueuePairs, numRanks, rankToPod.size());
 
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
   ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
 
-  Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap();
-  if (rankToPod.empty()) {
-    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
-    return 1;
-  }
-
   for (auto const& [pod, ranks] : rankToPod) {
     int n = ranks.size() * numGpus;
     int numGroups = n / groupSize;

From 92cc2657e76fda7d308b1fa6c34221643956a994 Mon Sep 17 00:00:00 2001
From: AtlantaPepsi <timhu102@amd.com>
Date: Tue, 28 Apr 2026 19:52:31 +0000
Subject: [PATCH 4/5] rolling back to single pod

---
 src/client/Presets/PodRing.hpp | 329 +++++++++++++++++----------------
 1 file changed, 165 insertions(+), 164 deletions(-)

diff --git a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp
index 82d5f82..5b449e5 100644
--- a/src/client/Presets/PodRing.hpp
+++ b/src/client/Presets/PodRing.hpp
@@ -25,17 +25,23 @@ int PodRingPreset(EnvVars&          ev,
                   std::string const presetName,
                   bool        const bytesSpecified)
 {
+  // Assuming single pod, for now
+  if (Utils::GetNumRankGroups() > 1) {
+    Utils::Print("[ERROR] PodRing preset can only be run across ranks that are homogenous\n");
+    Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n");
+    Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n");
+    return 1;
+  }
+  if (Utils::GetRankPerPodMap().empty()) {
+    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
+    return 1;
+  }
+
   ev.gfxUnroll       = EnvVars::GetEnvVar("GFX_UNROLL", 2);
 
   int numRanks       = TransferBench::GetNumRanks();
   int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
 
-  Utils::RankPerPodMap& rankToPod = Utils::GetRankPerPodMap();
-  if (rankToPod.empty()) {
-    Utils::Print("[ERROR] No pods detected. Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n");
-    return 1;
-  }
-
   int memTypeIdx    = EnvVars::GetEnvVar("MEM_TYPE"       , 0);
   int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
   int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
@@ -54,24 +60,21 @@ int PodRingPreset(EnvVars&          ev,
     Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n");
     return 1;
   }
+  if (numRanks * numGpus % groupSize) {
+    Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n",
+                 groupSize, numRanks * numGpus, numRanks);
+    return 1;
+  }
 
   int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0);
   bool nicDifference = false;
-  for (auto const& [pod, ranks] : rankToPod) {
-    int const podDevices = ranks.size() * numGpus;
-    if (podDevices % groupSize) {
-      Utils::Print("[ERROR] Group size %d cannot evenly divide %d devices in pod %ld (%zu ranks x %d GPUs).\n",
-                   groupSize, podDevices, pod, ranks.size(), numGpus);
+  for (int rank = 0; rank < numRanks; rank++) {
+    if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {
+      Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n");
       return 1;
     }
-    for (int rank : ranks) {
-      if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) {
-        Utils::Print("[ERROR] Pod %ld rank %d has fewer than %d GPUs\n", pod, rank, numGpus);
-        return 1;
-      }
-      if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))
-        nicDifference = true;
-    }
+    if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank))
+      nicDifference = true;
   }
   if (nicDifference)
     Utils::Print("[WARN] Not all ranks have the same number of NICs\n");
@@ -90,182 +93,180 @@ int PodRingPreset(EnvVars&          ev,
       ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
       ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
       ev.Print("STRIDE"         , stride       , "Reordering devices by taking %d steps", stride);
-      ev.Print("GROUP_SIZE"     , groupSize    , "Dividing each pod's devices into ring groups of %d", groupSize);
+      ev.Print("GROUP_SIZE"     , groupSize    , "Dividing all devices into ring groups of %d", groupSize);
       printf("\n");
     }
   }
 
   Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX");
   Utils::Print("==============================\n");
-  Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d] [#Pods:%zu]\n",
+  Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n",
                numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs,
-               devMemTypeStr.c_str(), numQueuePairs, numRanks, rankToPod.size());
+               devMemTypeStr.c_str(), numQueuePairs, numRanks);
 
   TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
   ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
 
-  for (auto const& [pod, ranks] : rankToPod) {
-    int n = ranks.size() * numGpus;
-    int numGroups = n / groupSize;
+  int n = numRanks * numGpus;
+  int numGroups = n / groupSize;
 
-    std::vector<int> indices(n);
-    for (int k = 0; k < n; k++) indices[k] = k;
-    Utils::StrideGenerate(indices, stride);
+  std::vector<int> indices(n);
+  for (int k = 0; k < n; k++) indices[k] = k;
+  Utils::StrideGenerate(indices, stride);
 
-    std::vector<MemDevice> devices(n);
-    for (int i = 0; i < n; i++) {
-      int const globalIdx = indices[i];
-      int const rank      = ranks[globalIdx / numGpus];
-      int const devIdx    = globalIdx % numGpus;
-      devices[i] = {memType, devIdx, rank};
-    }
+  std::vector<MemDevice> devices(n);
+  for (int i = 0; i < n; i++) {
+    int const globalIdx = indices[i];
+    int const rank      = globalIdx / numGpus;
+    int const devIdx    = globalIdx % numGpus;
+    devices[i] = {memType, devIdx, rank};
+  }
 
-    Utils::Print("Pod %ld: %d ring(s) of %d devices:\n", pod, numGroups, groupSize);
-    for (int group = 0; group < numGroups; group++) {
-      int const groupBase = group * groupSize;
-      Utils::Print("  Ring %d: ", group);
-      for (int i = 0; i < groupSize; i++) {
-        Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
+  Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize);
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    Utils::Print("  Ring %d: ", group);
+    for (int i = 0; i < groupSize; i++) {
+      Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex);
+    }
+    Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
+  }
+  Utils::Print("\n");
+
+  for (int group = 0; group < numGroups; group++) {
+    int const groupBase = group * groupSize;
+    std::vector<Transfer> transfers;
+
+    for (int i = 0; i < groupSize; i++) {
+      int srcIdx = groupBase + i;
+      int dstIdx = groupBase + (i + 1) % groupSize;
+
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      transfer.srcs.push_back(devices[srcIdx]);
+      transfer.dsts.push_back(devices[dstIdx]);
+      transfer.exeDevice = {exeType,
+                           (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
+                           (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
+      transfer.exeSubIndex = -1;
+      transfer.numSubExecs = numSubExecs;
+      transfers.push_back(transfer);
+
+      if (numQueuePairs > 0) {
+        TransferBench::Transfer nicTransfer;
+        nicTransfer.numBytes = numBytesPerTransfer;
+        nicTransfer.srcs.push_back(devices[srcIdx]);
+        nicTransfer.dsts.push_back(devices[dstIdx]);
+        nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
+                                (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
+        nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
+        nicTransfer.numSubExecs = numQueuePairs;
+        transfers.push_back(nicTransfer);
       }
-      Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex);
     }
-    Utils::Print("\n");
-
-    for (int group = 0; group < numGroups; group++) {
-      int const groupBase = group * groupSize;
-      std::vector<Transfer> transfers;
 
-      for (int i = 0; i < groupSize; i++) {
-        int srcIdx = groupBase + i;
-        int dstIdx = groupBase + (i + 1) % groupSize;
-
-        TransferBench::Transfer transfer;
-        transfer.numBytes = numBytesPerTransfer;
-        transfer.srcs.push_back(devices[srcIdx]);
-        transfer.dsts.push_back(devices[dstIdx]);
-        transfer.exeDevice = {exeType,
-                             (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex),
-                             (int32_t)(useRemoteRead ? devices[dstIdx].memRank  : devices[srcIdx].memRank)};
-        transfer.exeSubIndex = -1;
-        transfer.numSubExecs = numSubExecs;
-        transfers.push_back(transfer);
+    TransferBench::TestResults results;
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return 1;
+    }
+    if (showDetails) {
+      Utils::PrintResults(ev, 1, transfers, results);
+      Utils::Print("\n");
+    }
 
-        if (numQueuePairs > 0) {
-          TransferBench::Transfer nicTransfer;
-          nicTransfer.numBytes = numBytesPerTransfer;
-          nicTransfer.srcs.push_back(devices[srcIdx]);
-          nicTransfer.dsts.push_back(devices[dstIdx]);
-          nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST,
-                                  (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank};
-          nicTransfer.exeSubIndex = devices[dstIdx].memIndex;
-          nicTransfer.numSubExecs = numQueuePairs;
-          transfers.push_back(nicTransfer);
-        }
+    if (Utils::RankDoesOutput()) {
+      Utils::Print("\n--- Pod Ring Group %d ---\n", group);
+
+      int const numHops   = groupSize;
+      int const numRows   = 2 + numHops + 3;
+      int const numCols   = 6;
+      int const precision = 2;
+      Utils::TableHelper table(numRows, numCols, precision);
+
+      table.DrawRowBorder(0);
+      table.DrawColBorder(0);
+      table.DrawColBorder(numCols);
+      table.DrawRowBorder(numRows);
+
+      table.Set(0, 0, " Src ");
+      table.Set(0, 1, " Src ");
+      table.Set(0, 2, " Dst ");
+      table.Set(0, 3, " Dst ");
+      table.Set(0, 4, " GFX BW ");
+      table.Set(1, 0, " Rank ");
+      table.Set(1, 1, " GPU ");
+      table.Set(1, 2, " Rank ");
+      table.Set(1, 3, " GPU ");
+      table.Set(1, 4, " (GB/s) ");
+      table.DrawColBorder(2);
+      table.DrawColBorder(4);
+
+      if (numQueuePairs > 0) {
+        table.Set(0, 5, " NIC BW ");
+        table.Set(1, 5, " (GB/s) ");
+      } else {
+        table.Set(0, 5, " ");
+        table.Set(1, 5, " ");
       }
 
-      TransferBench::TestResults results;
-      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-        for (auto const& err : results.errResults)
-          Utils::Print("%s\n", err.errMsg.c_str());
-        return 1;
-      }
-      if (showDetails) {
-        Utils::PrintResults(ev, 1, transfers, results);
-        Utils::Print("\n");
-      }
+      table.DrawRowBorder(2);
 
-      if (Utils::RankDoesOutput()) {
-        Utils::Print("\n--- Pod %ld Ring Group %d ---\n", pod, group);
-
-        int const numHops   = groupSize;
-        int const numRows   = 2 + numHops + 3;
-        int const numCols   = 6;
-        int const precision = 2;
-        Utils::TableHelper table(numRows, numCols, precision);
-
-        table.DrawRowBorder(0);
-        table.DrawColBorder(0);
-        table.DrawColBorder(numCols);
-        table.DrawRowBorder(numRows);
-
-        table.Set(0, 0, " Src ");
-        table.Set(0, 1, " Src ");
-        table.Set(0, 2, " Dst ");
-        table.Set(0, 3, " Dst ");
-        table.Set(0, 4, " GFX BW ");
-        table.Set(1, 0, " Rank ");
-        table.Set(1, 1, " GPU ");
-        table.Set(1, 2, " Rank ");
-        table.Set(1, 3, " GPU ");
-        table.Set(1, 4, " (GB/s) ");
-        table.DrawColBorder(2);
-        table.DrawColBorder(4);
+      double gfxMin = std::numeric_limits<double>::max();
+      double gfxAvg = 0.0;
+      double gfxMax = std::numeric_limits<double>::lowest();
+      double nicMin = std::numeric_limits<double>::max();
+      double nicAvg = 0.0;
+      double nicMax = std::numeric_limits<double>::lowest();
 
-        if (numQueuePairs > 0) {
-          table.Set(0, 5, " NIC BW ");
-          table.Set(1, 5, " (GB/s) ");
-        } else {
-          table.Set(0, 5, " ");
-          table.Set(1, 5, " ");
-        }
+      int tfrIdx = 0;
+      for (int i = 0; i < numHops; i++) {
+        int srcIdx = groupBase + i;
+        int dstIdx = groupBase + (i + 1) % groupSize;
+        int row    = 2 + i;
 
-        table.DrawRowBorder(2);
+        double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
+        tfrIdx++;
 
-        double gfxMin = std::numeric_limits<double>::max();
-        double gfxAvg = 0.0;
-        double gfxMax = std::numeric_limits<double>::lowest();
-        double nicMin = std::numeric_limits<double>::max();
-        double nicAvg = 0.0;
-        double nicMax = std::numeric_limits<double>::lowest();
+        table.Set(row, 0, " %d ", devices[srcIdx].memRank);
+        table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
+        table.Set(row, 2, " %d ", devices[dstIdx].memRank);
+        table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
+        table.Set(row, 4, " %.2f ", gfxBw);
 
-        int tfrIdx = 0;
-        for (int i = 0; i < numHops; i++) {
-          int srcIdx = groupBase + i;
-          int dstIdx = groupBase + (i + 1) % groupSize;
-          int row    = 2 + i;
+        gfxMin = std::min(gfxMin, gfxBw);
+        gfxAvg += gfxBw;
+        gfxMax = std::max(gfxMax, gfxBw);
 
-          double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
+        if (numQueuePairs > 0) {
+          double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
           tfrIdx++;
-
-          table.Set(row, 0, " %d ", devices[srcIdx].memRank);
-          table.Set(row, 1, " %d ", devices[srcIdx].memIndex);
-          table.Set(row, 2, " %d ", devices[dstIdx].memRank);
-          table.Set(row, 3, " %d ", devices[dstIdx].memIndex);
-          table.Set(row, 4, " %.2f ", gfxBw);
-
-          gfxMin = std::min(gfxMin, gfxBw);
-          gfxAvg += gfxBw;
-          gfxMax = std::max(gfxMax, gfxBw);
-
-          if (numQueuePairs > 0) {
-            double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec;
-            tfrIdx++;
-            table.Set(row, 5, " %.2f ", nicBw);
-            nicMin = std::min(nicMin, nicBw);
-            nicAvg += nicBw;
-            nicMax = std::max(nicMax, nicBw);
-          }
+          table.Set(row, 5, " %.2f ", nicBw);
+          nicMin = std::min(nicMin, nicBw);
+          nicAvg += nicBw;
+          nicMax = std::max(nicMax, nicBw);
         }
+      }
 
-        int summaryBase = 2 + numHops;
-        table.DrawRowBorder(summaryBase);
-        table.Set(summaryBase    , 1, " MAX ");
-        table.Set(summaryBase + 1, 1, " AVG ");
-        table.Set(summaryBase + 2, 1, " MIN ");
-        table.Set(summaryBase    , 4, " %.2f ", gfxMax);
-        table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
-        table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
-
-        if (numQueuePairs > 0) {
-          table.Set(summaryBase    , 5, " %.2f ", nicMax);
-          table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
-          table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
-        }
+      int summaryBase = 2 + numHops;
+      table.DrawRowBorder(summaryBase);
+      table.Set(summaryBase    , 1, " MAX ");
+      table.Set(summaryBase + 1, 1, " AVG ");
+      table.Set(summaryBase + 2, 1, " MIN ");
+      table.Set(summaryBase    , 4, " %.2f ", gfxMax);
+      table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops);
+      table.Set(summaryBase + 2, 4, " %.2f ", gfxMin);
+
+      if (numQueuePairs > 0) {
+        table.Set(summaryBase    , 5, " %.2f ", nicMax);
+        table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops);
+        table.Set(summaryBase + 2, 5, " %.2f ", nicMin);
+      }
 
-        table.PrintTable(ev.outputToCsv, ev.showBorders);
+      table.PrintTable(ev.outputToCsv, ev.showBorders);
 
-        Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
-      }
+      Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
     }
   }
 

From 56b8de6c97f2bb7a00ee0c9c3d6e91220a3c5e11 Mon Sep 17 00:00:00 2001
From: AtlantaPepsi <timhu102@amd.com>
Date: Tue, 28 Apr 2026 21:50:20 +0000
Subject: [PATCH 5/5] PodAllToAll: run groups concurrently, group columns by rank

---
 src/client/Presets/PodAllToAll.hpp | 144 ++++++++++++++++++++++-------
 1 file changed, 110 insertions(+), 34 deletions(-)

diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp
index dc33f75..b449ca8 100644
--- a/src/client/Presets/PodAllToAll.hpp
+++ b/src/client/Presets/PodAllToAll.hpp
@@ -151,9 +151,17 @@ int PodAllToAllPreset(EnvVars&          ev,
       }
     }
 
+    // Build transfers for every group, then run once per pod so all groups share the same
+    // timed iterations (traffic across groups is concurrent within RunTransfers).
+    std::vector<Transfer> podTransfers;
+    std::vector<size_t> groupTransferBase(numGroups);
+    std::vector<std::vector<std::vector<int>>> groupReIndexes(numGroups);
+
     for (int group = 0; group < numGroups; group++) {
-      std::vector<std::vector<int>> groupReIndex(groupSize, std::vector<int>(groupSize, -1));
-      std::vector<Transfer> transfers;
+      groupTransferBase[group] = podTransfers.size();
+      groupReIndexes[group].assign(groupSize, std::vector<int>(groupSize, -1));
+      std::vector<std::vector<int>>& groupReIndex = groupReIndexes[group];
+
       for (int i = group * groupSize; i < (group + 1) * groupSize; i++) {
         for (int j = group * groupSize; j < (group + 1) * groupSize; j++) {
           if (i == j) {
@@ -171,8 +179,9 @@ int PodAllToAllPreset(EnvVars&          ev,
           transfer.numSubExecs = numSubExecs;
           int const localI = i - group * groupSize;
           int const localJ = j - group * groupSize;
-          groupReIndex[localI][localJ] = (int)transfers.size();
-          transfers.push_back(transfer);
+          groupReIndex[localI][localJ] =
+              (int)(podTransfers.size() - groupTransferBase[group]);
+          podTransfers.push_back(transfer);
         }
 
         if (numQueuePairs > 0) {
@@ -185,19 +194,47 @@ int PodAllToAllPreset(EnvVars&          ev,
                                (int32_t)devices[i].memIndex, (int32_t)devices[i].memRank};
           transfer.exeSubIndex = devices[next].memIndex;
           transfer.numSubExecs = numQueuePairs;
-          transfers.push_back(transfer);
+          podTransfers.push_back(transfer);
         }
       }
-      TransferBench::TestResults results;
-      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
-        for (auto const& err : results.errResults)
-          Utils::Print("%s\n", err.errMsg.c_str());
-        return 1;
-      }
-      if (showDetails) {
-        Utils::PrintResults(ev, 1, transfers, results);
+    }
+
+    if (Utils::RankDoesOutput()) {
+      for (int g = 0; g < numGroups; g++) {
+        int const gb = g * groupSize;
+        Utils::Print("A2A group %d:", g);
+        std::vector<int> ord(groupSize);
+        for (int i = 0; i < groupSize; i++) ord[i] = i;
+        std::sort(ord.begin(), ord.end(), [&](int a, int b) {
+          MemDevice const& da = devices[gb + a];
+          MemDevice const& db = devices[gb + b];
+          if (da.memRank != db.memRank) return da.memRank < db.memRank;
+          return da.memIndex < db.memIndex;
+        });
+        for (size_t si = 0; si < ord.size(); si++) {
+          MemDevice const& d = devices[gb + ord[si]];
+          Utils::Print("%s R%d:G%d", si ? "," : "", d.memRank, d.memIndex);
+        }
         Utils::Print("\n");
       }
+    }
+
+    TransferBench::TestResults results;
+    if (!TransferBench::RunTransfers(cfg, podTransfers, results)) {
+      for (auto const& err : results.errResults)
+        Utils::Print("%s\n", err.errMsg.c_str());
+      return 1;
+    }
+    if (showDetails) {
+      if (Utils::RankDoesOutput())
+        Utils::Print("\n--- Pod AllToAll (all %d groups concurrent) ---\n", numGroups);
+      Utils::PrintResults(ev, 1, podTransfers, results);
+      Utils::Print("\n");
+    }
+
+    for (int group = 0; group < numGroups; group++) {
+      std::vector<std::vector<int>> const& groupReIndex = groupReIndexes[group];
+      size_t const tfrBase = groupTransferBase[group];
 
       // Per-group bandwidth table
       std::vector<std::vector<double>> groupBw(groupSize, std::vector<double>(groupSize, -1.0));
@@ -205,14 +242,42 @@ int PodAllToAllPreset(EnvVars&          ev,
         for (int localJ = 0; localJ < groupSize; localJ++) {
           int const k = groupReIndex[localI][localJ];
           if (k >= 0)
-            groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec;
+            groupBw[localI][localJ] = results.tfrResults[tfrBase + k].avgBandwidthGbPerSec;
         }
       }
       if (Utils::RankDoesOutput()) {
         Utils::Print("\n--- Pod AllToAll Group %d ---\n", group);
         int const groupBase = group * groupSize;
+
+        // Display order: group devices by MPI rank, then GPU index (stride only affects execution order).
+        std::vector<int> order(groupSize);
+        for (int i = 0; i < groupSize; i++) order[i] = i;
+        std::sort(order.begin(), order.end(), [&](int a, int b) {
+          MemDevice const& da = devices[groupBase + a];
+          MemDevice const& db = devices[groupBase + b];
+          if (da.memRank != db.memRank) return da.memRank < db.memRank;
+          return da.memIndex < db.memIndex;
+        });
+        std::vector<int> colRanks;
+        for (int slot : order) {
+          int const r = devices[groupBase + slot].memRank;
+          if (colRanks.empty() || colRanks.back() != r) colRanks.push_back(r);
+        }
+        std::vector<std::vector<int>> localsPerCol;
+        localsPerCol.reserve(colRanks.size());
+        for (int dr : colRanks) {
+          std::vector<int> loc;
+          for (int li = 0; li < groupSize; li++) {
+            if (devices[groupBase + li].memRank == dr) loc.push_back(li);
+          }
+          std::sort(loc.begin(), loc.end(), [&](int a, int b) {
+            return devices[groupBase + a].memIndex < devices[groupBase + b].memIndex;
+          });
+          localsPerCol.push_back(std::move(loc));
+        }
+
         int const numRows = 2 + groupSize;
-        int const numCols = 2 + groupSize;
+        int const numCols = 2 + (int)colRanks.size();
         int const precision = 2;
         Utils::TableHelper table(numRows, numCols, precision);
         table.DrawRowBorder(0);
@@ -224,35 +289,46 @@ int PodAllToAllPreset(EnvVars&          ev,
         table.DrawColBorder(1);
         table.Set(1, 1, " Mem Device ");
 
-        // Column headers
-        int colPrevRank = -1;
-        for (int j = 0; j < groupSize; j++) {
-          int colIdx = 2 + j;
-          int r = devices[groupBase + j].memRank;
-          if (r != colPrevRank) {
-            table.DrawColBorder(colIdx);
-            table.Set(0, colIdx, " Rank %02d ", r);
-            colPrevRank = r;
+        for (size_t c = 0; c < colRanks.size(); c++) {
+          int const colIdx = 2 + (int)c;
+          table.DrawColBorder(colIdx);
+          table.Set(0, colIdx, " Rank %02d ", colRanks[c]);
+          std::string gpuHdr;
+          for (int li : localsPerCol[c]) {
+            char t[24];
+            snprintf(t, sizeof(t), "  GPU %02d ", devices[groupBase + li].memIndex);
+            gpuHdr += t;
           }
-          table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex);
+          table.Set(1, colIdx, "%s", gpuHdr.c_str());
+          table.SetColAlignment((int)c + 2, Utils::TableHelper::ALIGN_LEFT);
         }
 
-        // Row headers and data
         int rowPrevRank = -1;
-        for (int localI = 0; localI < groupSize; localI++) {
-          int rowIdx = 2 + localI;
-          int r = devices[groupBase + localI].memRank;
+        for (int disp = 0; disp < groupSize; disp++) {
+          int const localI = order[disp];
+          int const rowIdx = 2 + disp;
+          int const r = devices[groupBase + localI].memRank;
           if (r != rowPrevRank) {
             table.DrawRowBorder(rowIdx);
             table.Set(rowIdx, 0, " Rank %02d ", r);
             rowPrevRank = r;
+          } else {
+            table.Set(rowIdx, 0, " ");
           }
           table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex);
-          for (int localJ = 0; localJ < groupSize; localJ++) {
-            if (groupBw[localI][localJ] >= 0)
-              table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]);
-            else
-              table.Set(rowIdx, 2 + localJ, " N/A ");
+          for (size_t c = 0; c < colRanks.size(); c++) {
+            std::string cell;
+            for (int localJ : localsPerCol[c]) {
+              char t[16];
+              if (groupBw[localI][localJ] >= 0)
+                snprintf(t, sizeof(t), " %7.2f", groupBw[localI][localJ]);
+              else
+                snprintf(t, sizeof(t), " %7s", "N/A");
+              cell += t;
+            }
+            int const colIdx = 2 + (int)c;
+            table.Set(rowIdx, colIdx, "%s", cell.c_str());
+            table.SetCellAlignment(rowIdx, colIdx, Utils::TableHelper::ALIGN_LEFT);
           }
         }
         table.PrintTable(ev.outputToCsv, ev.showBorders);