diff --git a/src/client/Presets/NicPeerToPeer.hpp b/src/client/Presets/NicPeerToPeer.hpp index f0e0def..8c54baa 100644 --- a/src/client/Presets/NicPeerToPeer.hpp +++ b/src/client/Presets/NicPeerToPeer.hpp @@ -22,100 +22,6 @@ THE SOFTWARE. // Helper functions -// Returns a schedule of round robin pairing of N elements, using Circle Method -// if parallel, each round contains N/2 pairs, otherwise serial -void RoundRobinSchedule(std::vector>>& schedule, - int N, int parallel = 0) { - if (N == 1) { - schedule.push_back({{0,0}}); - return; - } - // Generate standard round-robin tournament (maximum parallelism) - std::vector>> fullSchedule; - - // Pad odd number of ranks with a dummy round (N+1) - int paddedN = N + N%2; - // Round-robin tournament scheduling - for (int round = 0; round < paddedN - 1; round++) { - std::vector> roundPairs; - std::vector> roundPairsReversed; - for (int i = 0; i < paddedN / 2; i++) { - int item1 = i; - int item2 = paddedN - 1 - i; - if (round > 0) { - // Rotate all except the first item - if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; - if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; - } - // Ignore dummy round, its partner sits out this ronud - if (item1 < N && item2 < N){ - roundPairs.push_back({item1, item2}); - roundPairsReversed.push_back({item2, item1}); - } - } - fullSchedule.push_back(roundPairs); - fullSchedule.push_back(roundPairsReversed); - } - - // A loopback round where all run in parallel - std::vector> selfRound; - for (int i = 0; i < N; i++) { - selfRound.push_back({i, i}); - } - fullSchedule.push_back(selfRound); - - if (parallel) { - schedule = std::move(fullSchedule); - } else { - // Serialize each round if needed - for (auto const& fullRound : fullSchedule) { - for (auto const& match : fullRound) { - std::vector> subRound; - subRound.push_back({match.first, match.second}); - schedule.push_back(subRound); - } - } - } -} - -// Returns a schedule for ordered 2-combination of N elements 
-// by pairing the list with its rotating self, -// each round contains n pairs, where 1 <= n <= N and N is divisible by n -// and an element cannot appear more than twice in a round, -void CombinationSchedule(std::vector>>& schedule, - int N, int n = 0) { - std::vector>> fullSchedule; - - if (n <= 0) n = N; - if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round - { - n = 1; - Utils::Print("[WARN] cannot create round robin schedule, falling back to serial"); - } - - // Generate rounds of combination based on incrementing distance - for (int i = 0; i < N; i++) { - std::vector> round; - for (int j = 0; j < N; j++) { - round.push_back({j, (j+i)%N}); - } - fullSchedule.push_back(round); - } - - // Step 2: Split each full round into sub-rounds with at most n pairs - for (auto const& fullRound : fullSchedule) { - for (size_t start = 0; start < fullRound.size(); start += n) { - std::vector> subRound; - for (size_t i = start; i < start + n && i < fullRound.size(); i++) { - subRound.push_back(fullRound[i]); - } - if (!subRound.empty()) { - schedule.push_back(subRound); - } - } - } -} - int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) { return TransferBench::IsCpuMemType(memType) ? 
TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) : @@ -203,8 +109,8 @@ int NicPeerToPeerPreset(EnvVars& ev, std::vector>> schedule; std::vector>> nicSchedule; - RoundRobinSchedule(schedule, numRanks, nodeParallel); - CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); + Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel); + Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel); int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank; int counter = 0; diff --git a/src/client/Presets/PodAllToAll.hpp b/src/client/Presets/PodAllToAll.hpp index e03d388..b449ca8 100644 --- a/src/client/Presets/PodAllToAll.hpp +++ b/src/client/Presets/PodAllToAll.hpp @@ -20,27 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -// Reorder elements of list by stepping through with stride k, wrapping around. -// When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are -// concatenated, so every element appears exactly once in the output. -// The reordered list will be further separated into different groups. 
-void StrideGenerate(std::vector& list, int k) { - int n = list.size(); - k = ((k % n) + n) % n; // normalize to 0..n-1 - if (k == 0) return; - - int d = std::gcd(k, n); - std::vector out; - out.reserve(n); - - for (int s = 0; s < d; s++) { - for (int j = 0; j < n / d; j++) { - out.push_back(list[(s + j * k) % n]); - } - } - list = std::move(out); -} - int PodAllToAllPreset(EnvVars& ev, size_t const numBytesPerTransfer, std::string const presetName, @@ -164,7 +143,7 @@ int PodAllToAllPreset(EnvVars& ev, std::vector devices(n); std::vector indices(n); for (int k = 0; k < n; k++) indices[k] = k; - StrideGenerate(indices, stride); + Utils::StrideGenerate(indices, stride); int idx = 0; for (int rank : ranks) { for (int devIdx = 0; devIdx < numGpus; devIdx++) { @@ -172,9 +151,17 @@ int PodAllToAllPreset(EnvVars& ev, } } + // Build transfers for every group, then run once per pod so all groups share the same + // timed iterations (traffic across groups is concurrent within RunTransfers). + std::vector podTransfers; + std::vector groupTransferBase(numGroups); + std::vector>> groupReIndexes(numGroups); + for (int group = 0; group < numGroups; group++) { - std::vector> groupReIndex(groupSize, std::vector(groupSize, -1)); - std::vector transfers; + groupTransferBase[group] = podTransfers.size(); + groupReIndexes[group].assign(groupSize, std::vector(groupSize, -1)); + std::vector>& groupReIndex = groupReIndexes[group]; + for (int i = group * groupSize; i < (group + 1) * groupSize; i++) { for (int j = group * groupSize; j < (group + 1) * groupSize; j++) { if (i == j) { @@ -192,8 +179,9 @@ int PodAllToAllPreset(EnvVars& ev, transfer.numSubExecs = numSubExecs; int const localI = i - group * groupSize; int const localJ = j - group * groupSize; - groupReIndex[localI][localJ] = (int)transfers.size(); - transfers.push_back(transfer); + groupReIndex[localI][localJ] = + (int)(podTransfers.size() - groupTransferBase[group]); + podTransfers.push_back(transfer); } if (numQueuePairs > 0) 
{ @@ -206,19 +194,47 @@ int PodAllToAllPreset(EnvVars& ev, (int32_t)devices[i].memIndex, (int32_t)devices[i].memRank}; transfer.exeSubIndex = devices[next].memIndex; transfer.numSubExecs = numQueuePairs; - transfers.push_back(transfer); + podTransfers.push_back(transfer); } } - TransferBench::TestResults results; - if (!TransferBench::RunTransfers(cfg, transfers, results)) { - for (auto const& err : results.errResults) - Utils::Print("%s\n", err.errMsg.c_str()); - return 1; - } - if (showDetails) { - Utils::PrintResults(ev, 1, transfers, results); + } + + if (Utils::RankDoesOutput()) { + for (int g = 0; g < numGroups; g++) { + int const gb = g * groupSize; + Utils::Print("A2A group %d:", g); + std::vector ord(groupSize); + for (int i = 0; i < groupSize; i++) ord[i] = i; + std::sort(ord.begin(), ord.end(), [&](int a, int b) { + MemDevice const& da = devices[gb + a]; + MemDevice const& db = devices[gb + b]; + if (da.memRank != db.memRank) return da.memRank < db.memRank; + return da.memIndex < db.memIndex; + }); + for (size_t si = 0; si < ord.size(); si++) { + MemDevice const& d = devices[gb + ord[si]]; + Utils::Print("%s R%d:G%d", si ? 
"," : "", d.memRank, d.memIndex); + } Utils::Print("\n"); } + } + + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, podTransfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + if (Utils::RankDoesOutput()) + Utils::Print("\n--- Pod AllToAll (all %d groups concurrent) ---\n", numGroups); + Utils::PrintResults(ev, 1, podTransfers, results); + Utils::Print("\n"); + } + + for (int group = 0; group < numGroups; group++) { + std::vector> const& groupReIndex = groupReIndexes[group]; + size_t const tfrBase = groupTransferBase[group]; // Per-group bandwidth table std::vector> groupBw(groupSize, std::vector(groupSize, -1.0)); @@ -226,14 +242,42 @@ int PodAllToAllPreset(EnvVars& ev, for (int localJ = 0; localJ < groupSize; localJ++) { int const k = groupReIndex[localI][localJ]; if (k >= 0) - groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec; + groupBw[localI][localJ] = results.tfrResults[tfrBase + k].avgBandwidthGbPerSec; } } if (Utils::RankDoesOutput()) { Utils::Print("\n--- Pod AllToAll Group %d ---\n", group); int const groupBase = group * groupSize; + + // Display order: group devices by MPI rank, then GPU index (stride only affects execution order). 
+ std::vector order(groupSize); + for (int i = 0; i < groupSize; i++) order[i] = i; + std::sort(order.begin(), order.end(), [&](int a, int b) { + MemDevice const& da = devices[groupBase + a]; + MemDevice const& db = devices[groupBase + b]; + if (da.memRank != db.memRank) return da.memRank < db.memRank; + return da.memIndex < db.memIndex; + }); + std::vector colRanks; + for (int slot : order) { + int const r = devices[groupBase + slot].memRank; + if (colRanks.empty() || colRanks.back() != r) colRanks.push_back(r); + } + std::vector> localsPerCol; + localsPerCol.reserve(colRanks.size()); + for (int dr : colRanks) { + std::vector loc; + for (int li = 0; li < groupSize; li++) { + if (devices[groupBase + li].memRank == dr) loc.push_back(li); + } + std::sort(loc.begin(), loc.end(), [&](int a, int b) { + return devices[groupBase + a].memIndex < devices[groupBase + b].memIndex; + }); + localsPerCol.push_back(std::move(loc)); + } + int const numRows = 2 + groupSize; - int const numCols = 2 + groupSize; + int const numCols = 2 + (int)colRanks.size(); int const precision = 2; Utils::TableHelper table(numRows, numCols, precision); table.DrawRowBorder(0); @@ -245,35 +289,46 @@ int PodAllToAllPreset(EnvVars& ev, table.DrawColBorder(1); table.Set(1, 1, " Mem Device "); - // Column headers - int colPrevRank = -1; - for (int j = 0; j < groupSize; j++) { - int colIdx = 2 + j; - int r = devices[groupBase + j].memRank; - if (r != colPrevRank) { - table.DrawColBorder(colIdx); - table.Set(0, colIdx, " Rank %02d ", r); - colPrevRank = r; + for (size_t c = 0; c < colRanks.size(); c++) { + int const colIdx = 2 + (int)c; + table.DrawColBorder(colIdx); + table.Set(0, colIdx, " Rank %02d ", colRanks[c]); + std::string gpuHdr; + for (int li : localsPerCol[c]) { + char t[24]; + snprintf(t, sizeof(t), " GPU %02d ", devices[groupBase + li].memIndex); + gpuHdr += t; } - table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex); + table.Set(1, colIdx, "%s", gpuHdr.c_str()); + 
table.SetColAlignment((int)c + 2, Utils::TableHelper::ALIGN_LEFT); } - // Row headers and data int rowPrevRank = -1; - for (int localI = 0; localI < groupSize; localI++) { - int rowIdx = 2 + localI; - int r = devices[groupBase + localI].memRank; + for (int disp = 0; disp < groupSize; disp++) { + int const localI = order[disp]; + int const rowIdx = 2 + disp; + int const r = devices[groupBase + localI].memRank; if (r != rowPrevRank) { table.DrawRowBorder(rowIdx); table.Set(rowIdx, 0, " Rank %02d ", r); rowPrevRank = r; + } else { + table.Set(rowIdx, 0, " "); } table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex); - for (int localJ = 0; localJ < groupSize; localJ++) { - if (groupBw[localI][localJ] >= 0) - table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]); - else - table.Set(rowIdx, 2 + localJ, " N/A "); + for (size_t c = 0; c < colRanks.size(); c++) { + std::string cell; + for (int localJ : localsPerCol[c]) { + char t[16]; + if (groupBw[localI][localJ] >= 0) + snprintf(t, sizeof(t), " %7.2f", groupBw[localI][localJ]); + else + snprintf(t, sizeof(t), " %7s", "N/A"); + cell += t; + } + int const colIdx = 2 + (int)c; + table.Set(rowIdx, colIdx, "%s", cell.c_str()); + table.SetCellAlignment(rowIdx, colIdx, Utils::TableHelper::ALIGN_LEFT); } } table.PrintTable(ev.outputToCsv, ev.showBorders); diff --git a/src/client/Presets/PodPeerToPeer.hpp b/src/client/Presets/PodPeerToPeer.hpp index 2148bd4..9ea8ca7 100644 --- a/src/client/Presets/PodPeerToPeer.hpp +++ b/src/client/Presets/PodPeerToPeer.hpp @@ -126,7 +126,7 @@ int PodPeerToPeerPreset(EnvVars& ev, } else { // parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair std::vector>> nodePairSchedule; - RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); + Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1); for (auto const& roundNodePairs : nodePairSchedule) { for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) { diff --git 
a/src/client/Presets/PodRing.hpp b/src/client/Presets/PodRing.hpp new file mode 100644 index 0000000..5b449e5 --- /dev/null +++ b/src/client/Presets/PodRing.hpp @@ -0,0 +1,280 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +int PodRingPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + // Assuming single pod, for now + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] PodRing preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + if (Utils::GetRankPerPodMap().empty()) { + Utils::Print("[ERROR] No pods detected. 
Set TB_FORCE_SINGLE_POD=1 to treat all ranks as a single pod.\n"); + return 1; + } + + ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2); + + int numRanks = TransferBench::GetNumRanks(); + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + + int memTypeIdx = EnvVars::GetEnvVar("MEM_TYPE" , 0); + int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0); + int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); + int showDetails = EnvVars::GetEnvVar("SHOW_DETAILS" , 0); + int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); + int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); + int stride = EnvVars::GetEnvVar("STRIDE" , 1); + int groupSize = EnvVars::GetEnvVar("GROUP_SIZE" , numRanks * numGpus); + + if (numGpus <= 0 || numGpus > numDetectedGpus) { + Utils::Print("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus); + return 1; + } + if (groupSize < 2) { + Utils::Print("[ERROR] Group size must be at least 2 to form a ring\n"); + return 1; + } + if (numRanks * numGpus % groupSize) { + Utils::Print("[ERROR] Group size %d cannot evenly divide %d total devices from %d ranks.\n", + groupSize, numRanks * numGpus, numRanks); + return 1; + } + + int numNics = TransferBench::GetNumExecutors(EXE_NIC, 0); + bool nicDifference = false; + for (int rank = 0; rank < numRanks; rank++) { + if (numGpus > TransferBench::GetNumExecutors(EXE_GPU_GFX, rank)) { + Utils::Print("[ERROR] PodRing preset requires each rank to have the same number of GPUs\n"); + return 1; + } + if (numQueuePairs > 0 && numNics != TransferBench::GetNumExecutors(EXE_NIC, rank)) + nicDifference = true; + } + if (nicDifference) + Utils::Print("[WARN] Not all ranks have the same number of NICs\n"); + + MemType memType = Utils::GetGpuMemType(memTypeIdx); + std::string devMemTypeStr = Utils::GetGpuMemTypeStr(memTypeIdx); + + if (Utils::RankDoesOutput()) { + ev.DisplayEnvVars(); + if 
(!ev.hideEnv) { + if (!ev.outputToCsv) printf("[PodRing Related]\n"); + ev.Print("MEM_TYPE" , memTypeIdx , "Using %s GPU memory (%s)", devMemTypeStr.c_str(), Utils::GetAllGpuMemTypeStr().c_str()); + ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); + ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs); + ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); + ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); + ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC"); + ev.Print("STRIDE" , stride , "Reordering devices by taking %d steps", stride); + ev.Print("GROUP_SIZE" , groupSize , "Dividing all devices into ring groups of %d", groupSize); + printf("\n"); + } + } + + Utils::Print("GPU-%s IntraPod Ring benchmark:\n", useDmaExec ? "DMA" : "GFX"); + Utils::Print("==============================\n"); + Utils::Print("[%lu bytes per Transfer] [%s:%d] [MemType:%s] [NIC QueuePairs:%d] [#Ranks:%d]\n", + numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, + devMemTypeStr.c_str(), numQueuePairs, numRanks); + + TransferBench::ConfigOptions cfg = ev.ToConfigOptions(); + ExeType exeType = useDmaExec ? 
EXE_GPU_DMA : EXE_GPU_GFX; + + int n = numRanks * numGpus; + int numGroups = n / groupSize; + + std::vector<int> indices(n); + for (int k = 0; k < n; k++) indices[k] = k; + Utils::StrideGenerate(indices, stride); + + std::vector<MemDevice> devices(n); + for (int i = 0; i < n; i++) { + int const globalIdx = indices[i]; + int const rank = globalIdx / numGpus; + int const devIdx = globalIdx % numGpus; + devices[i] = {memType, devIdx, rank}; + } + + Utils::Print("%d ring(s) of %d devices:\n", numGroups, groupSize); + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + Utils::Print(" Ring %d: ", group); + for (int i = 0; i < groupSize; i++) { + Utils::Print("R%d:G%d -> ", devices[groupBase + i].memRank, devices[groupBase + i].memIndex); + } + Utils::Print("R%d:G%d\n", devices[groupBase].memRank, devices[groupBase].memIndex); + } + Utils::Print("\n"); + + for (int group = 0; group < numGroups; group++) { + int const groupBase = group * groupSize; + std::vector<TransferBench::Transfer> transfers; + + for (int i = 0; i < groupSize; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + + TransferBench::Transfer transfer; + transfer.numBytes = numBytesPerTransfer; + transfer.srcs.push_back(devices[srcIdx]); + transfer.dsts.push_back(devices[dstIdx]); + transfer.exeDevice = {exeType, + (int32_t)(useRemoteRead ? devices[dstIdx].memIndex : devices[srcIdx].memIndex), + (int32_t)(useRemoteRead ?
devices[dstIdx].memRank : devices[srcIdx].memRank)}; + transfer.exeSubIndex = -1; + transfer.numSubExecs = numSubExecs; + transfers.push_back(transfer); + + if (numQueuePairs > 0) { + TransferBench::Transfer nicTransfer; + nicTransfer.numBytes = numBytesPerTransfer; + nicTransfer.srcs.push_back(devices[srcIdx]); + nicTransfer.dsts.push_back(devices[dstIdx]); + nicTransfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, + (int32_t)devices[srcIdx].memIndex, (int32_t)devices[srcIdx].memRank}; + nicTransfer.exeSubIndex = devices[dstIdx].memIndex; + nicTransfer.numSubExecs = numQueuePairs; + transfers.push_back(nicTransfer); + } + } + + TransferBench::TestResults results; + if (!TransferBench::RunTransfers(cfg, transfers, results)) { + for (auto const& err : results.errResults) + Utils::Print("%s\n", err.errMsg.c_str()); + return 1; + } + if (showDetails) { + Utils::PrintResults(ev, 1, transfers, results); + Utils::Print("\n"); + } + + if (Utils::RankDoesOutput()) { + Utils::Print("\n--- Pod Ring Group %d ---\n", group); + + int const numHops = groupSize; + int const numRows = 2 + numHops + 3; + int const numCols = 6; + int const precision = 2; + Utils::TableHelper table(numRows, numCols, precision); + + table.DrawRowBorder(0); + table.DrawColBorder(0); + table.DrawColBorder(numCols); + table.DrawRowBorder(numRows); + + table.Set(0, 0, " Src "); + table.Set(0, 1, " Src "); + table.Set(0, 2, " Dst "); + table.Set(0, 3, " Dst "); + table.Set(0, 4, " GFX BW "); + table.Set(1, 0, " Rank "); + table.Set(1, 1, " GPU "); + table.Set(1, 2, " Rank "); + table.Set(1, 3, " GPU "); + table.Set(1, 4, " (GB/s) "); + table.DrawColBorder(2); + table.DrawColBorder(4); + + if (numQueuePairs > 0) { + table.Set(0, 5, " NIC BW "); + table.Set(1, 5, " (GB/s) "); + } else { + table.Set(0, 5, " "); + table.Set(1, 5, " "); + } + + table.DrawRowBorder(2); + + double gfxMin = std::numeric_limits<double>::max(); + double gfxAvg = 0.0; + double gfxMax = std::numeric_limits<double>::lowest(); + double nicMin =
std::numeric_limits<double>::max(); + double nicAvg = 0.0; + double nicMax = std::numeric_limits<double>::lowest(); + + int tfrIdx = 0; + for (int i = 0; i < numHops; i++) { + int srcIdx = groupBase + i; + int dstIdx = groupBase + (i + 1) % groupSize; + int row = 2 + i; + + double gfxBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + + table.Set(row, 0, " %d ", devices[srcIdx].memRank); + table.Set(row, 1, " %d ", devices[srcIdx].memIndex); + table.Set(row, 2, " %d ", devices[dstIdx].memRank); + table.Set(row, 3, " %d ", devices[dstIdx].memIndex); + table.Set(row, 4, " %.2f ", gfxBw); + + gfxMin = std::min(gfxMin, gfxBw); + gfxAvg += gfxBw; + gfxMax = std::max(gfxMax, gfxBw); + + if (numQueuePairs > 0) { + double nicBw = results.tfrResults[tfrIdx].avgBandwidthGbPerSec; + tfrIdx++; + table.Set(row, 5, " %.2f ", nicBw); + nicMin = std::min(nicMin, nicBw); + nicAvg += nicBw; + nicMax = std::max(nicMax, nicBw); + } + } + + int summaryBase = 2 + numHops; + table.DrawRowBorder(summaryBase); + table.Set(summaryBase , 1, " MAX "); + table.Set(summaryBase + 1, 1, " AVG "); + table.Set(summaryBase + 2, 1, " MIN "); + table.Set(summaryBase , 4, " %.2f ", gfxMax); + table.Set(summaryBase + 1, 4, " %.2f ", gfxAvg / numHops); + table.Set(summaryBase + 2, 4, " %.2f ", gfxMin); + + if (numQueuePairs > 0) { + table.Set(summaryBase , 5, " %.2f ", nicMax); + table.Set(summaryBase + 1, 5, " %.2f ", nicAvg / numHops); + table.Set(summaryBase + 2, 5, " %.2f ", nicMin); + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec); + } + } + + if (!Utils::RankDoesOutput()) return 0; + + if (Utils::HasDuplicateHostname()) { + printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + + return 0; +} diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index 8354208..5505681 100644 --- 
a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -43,6 +43,7 @@ THE SOFTWARE. #include "PeerToPeer.hpp" #include "PodAllToAll.hpp" #include "PodPeerToPeer.hpp" +#include "PodRing.hpp" #include "Scaling.hpp" #include "Schmoo.hpp" #include "SmokeTest.hpp" @@ -77,6 +78,7 @@ std::map presetFuncMap = {"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, {"poda2a", {PodAllToAllPreset, "All-to-all transfers between subgroups of ranks within a pod"}}, {"podp2p", {PodPeerToPeerPreset, "Peer-to-peer transfers test among ranks within a pod"}}, + {"podring", {PodRingPreset, "Ring transfers within subgroups of ranks in a pod"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, diff --git a/src/client/Utilities.hpp b/src/client/Utilities.hpp index 259e4cc..497770f 100644 --- a/src/client/Utilities.hpp +++ b/src/client/Utilities.hpp @@ -155,6 +155,24 @@ namespace TransferBench::Utils bool AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr); bool DeallocateMemory(MemType memType, void *memPtr, size_t const bytes); + // Reorder elements of list by stepping through with stride k, wrapping around. + // When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are + // concatenated, so every element appears exactly once in the output. + // The reordered list will be further separated into different groups. + void StrideGenerate(std::vector<int>& list, int k); + + // Returns a schedule of round robin pairing of N elements, using Circle Method. + // If parallel, each round contains N/2 pairs, otherwise serial. + void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int parallel = 0); + + // Returns a schedule for ordered 2-combination of N elements + // by pairing the list with its rotating self. 
+ // Each round contains n pairs, where 1 <= n <= N and N is divisible by n, + // and an element cannot appear more than twice in a round. + void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int n = 0); + // Implementation details below //================================================================ TableHelper::TableHelper(int numRows, int numCols, int precision) : @@ -769,4 +787,113 @@ namespace TransferBench::Utils { return (TransferBench::DeallocateMemory(memType, memPtr, bytes).errType != TransferBench::ERR_NONE); } + + void StrideGenerate(std::vector<int>& list, int k) + { + int n = list.size(); + if (n == 0) return; + k = ((k % n) + n) % n; // normalize to 0..n-1 + if (k == 0) return; + + int d = std::gcd(k, n); + std::vector<int> out; + out.reserve(n); + + for (int s = 0; s < d; s++) { + for (int j = 0; j < n / d; j++) { + out.push_back(list[(s + j * k) % n]); + } + } + list = std::move(out); + } + + void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int parallel) + { + if (N == 1) { + schedule.push_back({{0, 0}}); + return; + } + // Generate standard round-robin tournament (maximum parallelism) + std::vector<std::vector<std::pair<int, int>>> fullSchedule; + + // Pad odd number of ranks with a dummy round (N+1) + int paddedN = N + N % 2; + // Round-robin tournament scheduling + for (int round = 0; round < paddedN - 1; round++) { + std::vector<std::pair<int, int>> roundPairs; + std::vector<std::pair<int, int>> roundPairsReversed; + for (int i = 0; i < paddedN / 2; i++) { + int item1 = i; + int item2 = paddedN - 1 - i; + if (round > 0) { + // Rotate all except the first item + if (item1 > 0) item1 = ((item1 - 1 + round) % (paddedN - 1)) + 1; + if (item2 > 0) item2 = ((item2 - 1 + round) % (paddedN - 1)) + 1; + } + // Ignore dummy round, its partner sits out this round + if (item1 < N && item2 < N) { + roundPairs.push_back({item1, item2}); + roundPairsReversed.push_back({item2, item1}); + } + } + fullSchedule.push_back(roundPairs); + fullSchedule.push_back(roundPairsReversed); + } + + // A loopback round where all run in 
parallel + std::vector<std::pair<int, int>> selfRound; + for (int i = 0; i < N; i++) { + selfRound.push_back({i, i}); + } + fullSchedule.push_back(selfRound); + + if (parallel) { + schedule = std::move(fullSchedule); + } else { + // Serialize each round if needed + for (auto const& fullRound : fullSchedule) { + for (auto const& match : fullRound) { + std::vector<std::pair<int, int>> subRound; + subRound.push_back({match.first, match.second}); + schedule.push_back(subRound); + } + } + } + } + + void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule, + int N, int n) + { + std::vector<std::vector<std::pair<int, int>>> fullSchedule; + + if (n <= 0) n = N; + if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round + { + n = 1; + Print("[WARN] cannot create combination schedule, falling back to serial\n"); + } + + // Generate rounds of combination based on incrementing distance + for (int i = 0; i < N; i++) { + std::vector<std::pair<int, int>> round; + for (int j = 0; j < N; j++) { + round.push_back({j, (j + i) % N}); + } + fullSchedule.push_back(round); + } + + // Step 2: Split each full round into sub-rounds with at most n pairs + for (auto const& fullRound : fullSchedule) { + for (size_t start = 0; start < fullRound.size(); start += n) { + std::vector<std::pair<int, int>> subRound; + for (size_t i = start; i < start + n && i < fullRound.size(); i++) { + subRound.push_back(fullRound[i]); + } + if (!subRound.empty()) { + schedule.push_back(subRound); + } + } + } + } };