Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 2 additions & 96 deletions src/client/Presets/NicPeerToPeer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,100 +22,6 @@ THE SOFTWARE.

// Helper functions

// Builds a round-robin pairing schedule of N elements using the circle
// method and appends it to `schedule`. Every ordered pair (a,b), a != b,
// appears exactly once, followed by a final loopback round pairing each
// element with itself. With parallel != 0 each round holds up to N/2
// disjoint pairs; otherwise every pair becomes its own single-entry round.
void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
                        int N, int parallel = 0) {
  if (N == 1) {
    schedule.push_back({{0,0}});
    return;
  }

  // The circle method needs an even element count; odd N gets one dummy slot.
  int const M = N + N % 2;

  // Generate the standard round-robin tournament (maximum parallelism).
  std::vector<std::vector<std::pair<int, int>>> rounds;
  for (int r = 0; r < M - 1; r++) {
    std::vector<std::pair<int, int>> forward;
    std::vector<std::pair<int, int>> backward;
    for (int i = 0; i < M / 2; i++) {
      int a = i;
      int b = M - 1 - i;
      // Element 0 stays fixed; all others rotate by r positions.
      if (r > 0 && a > 0) a = (a - 1 + r) % (M - 1) + 1;
      if (r > 0 && b > 0) b = (b - 1 + r) % (M - 1) + 1;
      // A pair touching the dummy slot means its partner sits out this round.
      if (a >= N || b >= N) continue;
      forward.emplace_back(a, b);
      backward.emplace_back(b, a);
    }
    rounds.push_back(forward);
    rounds.push_back(backward);
  }

  // Final loopback round where all elements run in parallel against themselves.
  std::vector<std::pair<int, int>> loopback;
  for (int i = 0; i < N; i++) loopback.emplace_back(i, i);
  rounds.push_back(loopback);

  if (parallel) {
    schedule = std::move(rounds);
  } else {
    // Serialize: each pair gets its own one-entry round.
    for (auto const& round : rounds) {
      for (auto const& match : round) {
        schedule.push_back({match});
      }
    }
  }
}

// Returns a schedule for ordered 2-combination of N elements
// by pairing the list with its rotating self: round i pairs each
// element j with (j+i) mod N.
// Each emitted round contains n pairs, where 1 <= n <= N and N is
// divisible by n, and an element cannot appear more than twice in a
// round (once as first member of a pair, once as second).
// Appends the generated rounds to `schedule`.
void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
                         int N, int n = 0) {
  std::vector<std::vector<std::pair<int, int>>> fullSchedule;

  if (n <= 0) n = N;
  if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
  {
    n = 1;
    // Fall back to fully serial sub-rounds on invalid/unbalanced parameters.
    Utils::Print("[WARN] cannot create combination schedule, falling back to serial");
  }

  // Generate rounds of combination based on incrementing distance
  for (int i = 0; i < N; i++) {
    std::vector<std::pair<int, int>> round;
    for (int j = 0; j < N; j++) {
      round.push_back({j, (j+i)%N});
    }
    fullSchedule.push_back(round);
  }

  // Split each full round into sub-rounds with at most n pairs each
  for (auto const& fullRound : fullSchedule) {
    for (size_t start = 0; start < fullRound.size(); start += n) {
      std::vector<std::pair<int, int>> subRound;
      for (size_t i = start; i < start + n && i < fullRound.size(); i++) {
        subRound.push_back(fullRound[i]);
      }
      if (!subRound.empty()) {
        schedule.push_back(subRound);
      }
    }
  }
}

int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) {
return TransferBench::IsCpuMemType(memType) ?
TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) :
Expand Down Expand Up @@ -203,8 +109,8 @@ int NicPeerToPeerPreset(EnvVars& ev,
std::vector<std::vector<std::pair<int, int>>> schedule;
std::vector<std::vector<std::pair<int, int>>> nicSchedule;

RoundRobinSchedule(schedule, numRanks, nodeParallel);
CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel);
Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel);
Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel);

int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank;
int counter = 0;
Expand Down
167 changes: 111 additions & 56 deletions src/client/Presets/PodAllToAll.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// Reorder elements of list by stepping through with stride k, wrapping around.
// When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are
// concatenated, so every element appears exactly once in the output.
// The reordered list will be further separated into different groups.
// No-op for empty or single-element lists (the empty case previously hit
// modulo-by-zero UB) and for strides that are a multiple of the list length.
void StrideGenerate(std::vector<int>& list, int k) {
  int const n = (int)list.size();
  if (n <= 1) return;            // nothing to reorder; also guards k % 0 below
  k = ((k % n) + n) % n;         // normalize to 0..n-1
  if (k == 0) return;            // identity permutation

  int const d = std::gcd(k, n);  // number of disjoint orbits
  std::vector<int> out;
  out.reserve(n);

  // Emit each orbit in turn: orbit s visits s, s+k, s+2k, ... (mod n)
  for (int s = 0; s < d; s++) {
    for (int j = 0; j < n / d; j++) {
      out.push_back(list[(s + j * k) % n]);
    }
  }
  list = std::move(out);
}

int PodAllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName,
Expand Down Expand Up @@ -164,17 +143,25 @@ int PodAllToAllPreset(EnvVars& ev,
std::vector<MemDevice> devices(n);
std::vector<int> indices(n);
for (int k = 0; k < n; k++) indices[k] = k;
StrideGenerate(indices, stride);
Utils::StrideGenerate(indices, stride);
int idx = 0;
for (int rank : ranks) {
for (int devIdx = 0; devIdx < numGpus; devIdx++) {
devices[indices[idx++]] = {memType, devIdx, rank};
}
}

// Build transfers for every group, then run once per pod so all groups share the same
// timed iterations (traffic across groups is concurrent within RunTransfers).
std::vector<Transfer> podTransfers;
std::vector<size_t> groupTransferBase(numGroups);
std::vector<std::vector<std::vector<int>>> groupReIndexes(numGroups);

for (int group = 0; group < numGroups; group++) {
std::vector<std::vector<int>> groupReIndex(groupSize, std::vector<int>(groupSize, -1));
std::vector<Transfer> transfers;
groupTransferBase[group] = podTransfers.size();
groupReIndexes[group].assign(groupSize, std::vector<int>(groupSize, -1));
std::vector<std::vector<int>>& groupReIndex = groupReIndexes[group];

for (int i = group * groupSize; i < (group + 1) * groupSize; i++) {
for (int j = group * groupSize; j < (group + 1) * groupSize; j++) {
if (i == j) {
Expand All @@ -192,8 +179,9 @@ int PodAllToAllPreset(EnvVars& ev,
transfer.numSubExecs = numSubExecs;
int const localI = i - group * groupSize;
int const localJ = j - group * groupSize;
groupReIndex[localI][localJ] = (int)transfers.size();
transfers.push_back(transfer);
groupReIndex[localI][localJ] =
(int)(podTransfers.size() - groupTransferBase[group]);
podTransfers.push_back(transfer);
}

if (numQueuePairs > 0) {
Expand All @@ -206,34 +194,90 @@ int PodAllToAllPreset(EnvVars& ev,
(int32_t)devices[i].memIndex, (int32_t)devices[i].memRank};
transfer.exeSubIndex = devices[next].memIndex;
transfer.numSubExecs = numQueuePairs;
transfers.push_back(transfer);
podTransfers.push_back(transfer);
}
}
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
Utils::Print("%s\n", err.errMsg.c_str());
return 1;
}
if (showDetails) {
Utils::PrintResults(ev, 1, transfers, results);
}

if (Utils::RankDoesOutput()) {
for (int g = 0; g < numGroups; g++) {
int const gb = g * groupSize;
Utils::Print("A2A group %d:", g);
std::vector<int> ord(groupSize);
for (int i = 0; i < groupSize; i++) ord[i] = i;
std::sort(ord.begin(), ord.end(), [&](int a, int b) {
MemDevice const& da = devices[gb + a];
MemDevice const& db = devices[gb + b];
if (da.memRank != db.memRank) return da.memRank < db.memRank;
return da.memIndex < db.memIndex;
});
for (size_t si = 0; si < ord.size(); si++) {
MemDevice const& d = devices[gb + ord[si]];
Utils::Print("%s R%d:G%d", si ? "," : "", d.memRank, d.memIndex);
}
Utils::Print("\n");
}
}

TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, podTransfers, results)) {
for (auto const& err : results.errResults)
Utils::Print("%s\n", err.errMsg.c_str());
return 1;
}
if (showDetails) {
if (Utils::RankDoesOutput())
Utils::Print("\n--- Pod AllToAll (all %d groups concurrent) ---\n", numGroups);
Utils::PrintResults(ev, 1, podTransfers, results);
Utils::Print("\n");
}

for (int group = 0; group < numGroups; group++) {
std::vector<std::vector<int>> const& groupReIndex = groupReIndexes[group];
size_t const tfrBase = groupTransferBase[group];

// Per-group bandwidth table
std::vector<std::vector<double>> groupBw(groupSize, std::vector<double>(groupSize, -1.0));
for (int localI = 0; localI < groupSize; localI++) {
for (int localJ = 0; localJ < groupSize; localJ++) {
int const k = groupReIndex[localI][localJ];
if (k >= 0)
groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec;
groupBw[localI][localJ] = results.tfrResults[tfrBase + k].avgBandwidthGbPerSec;
}
}
if (Utils::RankDoesOutput()) {
Utils::Print("\n--- Pod AllToAll Group %d ---\n", group);
int const groupBase = group * groupSize;

// Display order: group devices by MPI rank, then GPU index (stride only affects execution order).
std::vector<int> order(groupSize);
for (int i = 0; i < groupSize; i++) order[i] = i;
std::sort(order.begin(), order.end(), [&](int a, int b) {
MemDevice const& da = devices[groupBase + a];
MemDevice const& db = devices[groupBase + b];
if (da.memRank != db.memRank) return da.memRank < db.memRank;
return da.memIndex < db.memIndex;
});
Comment thread
AtlantaPepsi marked this conversation as resolved.
std::vector<int> colRanks;
for (int slot : order) {
int const r = devices[groupBase + slot].memRank;
if (colRanks.empty() || colRanks.back() != r) colRanks.push_back(r);
}
std::vector<std::vector<int>> localsPerCol;
localsPerCol.reserve(colRanks.size());
for (int dr : colRanks) {
std::vector<int> loc;
for (int li = 0; li < groupSize; li++) {
if (devices[groupBase + li].memRank == dr) loc.push_back(li);
}
std::sort(loc.begin(), loc.end(), [&](int a, int b) {
return devices[groupBase + a].memIndex < devices[groupBase + b].memIndex;
});
localsPerCol.push_back(std::move(loc));
}

int const numRows = 2 + groupSize;
int const numCols = 2 + groupSize;
int const numCols = 2 + (int)colRanks.size();
int const precision = 2;
Utils::TableHelper table(numRows, numCols, precision);
table.DrawRowBorder(0);
Expand All @@ -245,35 +289,46 @@ int PodAllToAllPreset(EnvVars& ev,
table.DrawColBorder(1);
table.Set(1, 1, " Mem Device ");

// Column headers
int colPrevRank = -1;
for (int j = 0; j < groupSize; j++) {
int colIdx = 2 + j;
int r = devices[groupBase + j].memRank;
if (r != colPrevRank) {
table.DrawColBorder(colIdx);
table.Set(0, colIdx, " Rank %02d ", r);
colPrevRank = r;
for (size_t c = 0; c < colRanks.size(); c++) {
int const colIdx = 2 + (int)c;
table.DrawColBorder(colIdx);
table.Set(0, colIdx, " Rank %02d ", colRanks[c]);
std::string gpuHdr;
for (int li : localsPerCol[c]) {
char t[24];
snprintf(t, sizeof(t), " GPU %02d ", devices[groupBase + li].memIndex);
gpuHdr += t;
}
table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex);
table.Set(1, colIdx, "%s", gpuHdr.c_str());
table.SetColAlignment((int)c + 2, Utils::TableHelper::ALIGN_LEFT);
}

// Row headers and data
int rowPrevRank = -1;
for (int localI = 0; localI < groupSize; localI++) {
int rowIdx = 2 + localI;
int r = devices[groupBase + localI].memRank;
for (int disp = 0; disp < groupSize; disp++) {
int const localI = order[disp];
int const rowIdx = 2 + disp;
int const r = devices[groupBase + localI].memRank;
if (r != rowPrevRank) {
table.DrawRowBorder(rowIdx);
table.Set(rowIdx, 0, " Rank %02d ", r);
rowPrevRank = r;
} else {
table.Set(rowIdx, 0, " ");
}
table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex);
for (int localJ = 0; localJ < groupSize; localJ++) {
if (groupBw[localI][localJ] >= 0)
table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]);
else
table.Set(rowIdx, 2 + localJ, " N/A ");
for (size_t c = 0; c < colRanks.size(); c++) {
std::string cell;
for (int localJ : localsPerCol[c]) {
char t[16];
if (groupBw[localI][localJ] >= 0)
snprintf(t, sizeof(t), " %7.2f", groupBw[localI][localJ]);
else
snprintf(t, sizeof(t), " %7s", "N/A");
cell += t;
}
int const colIdx = 2 + (int)c;
table.Set(rowIdx, colIdx, "%s", cell.c_str());
table.SetCellAlignment(rowIdx, colIdx, Utils::TableHelper::ALIGN_LEFT);
}
}
table.PrintTable(ev.outputToCsv, ev.showBorders);
Expand Down
2 changes: 1 addition & 1 deletion src/client/Presets/PodPeerToPeer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ int PodPeerToPeerPreset(EnvVars& ev,
} else {
// parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair
std::vector<std::vector<std::pair<int, int>>> nodePairSchedule;
RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1);
Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1);

for (auto const& roundNodePairs : nodePairSchedule) {
for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) {
Expand Down
Loading