Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 2 additions & 96 deletions src/client/Presets/NicPeerToPeer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,100 +22,6 @@ THE SOFTWARE.

// Helper functions

// Builds a round-robin pairing schedule of N elements using the circle
// method and appends it to `schedule`. Every ordered pair (a,b), a != b,
// appears exactly once, followed by a final loopback round pairing each
// element with itself. With parallel != 0 each round holds up to N/2
// disjoint pairs; otherwise every pair becomes its own single-entry round.
void RoundRobinSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
                        int N, int parallel = 0) {
  if (N == 1) {
    schedule.push_back({{0,0}});
    return;
  }

  // The circle method needs an even element count; odd N gets one dummy slot.
  int const M = N + N % 2;

  // Generate the standard round-robin tournament (maximum parallelism).
  std::vector<std::vector<std::pair<int, int>>> rounds;
  for (int r = 0; r < M - 1; r++) {
    std::vector<std::pair<int, int>> forward;
    std::vector<std::pair<int, int>> backward;
    for (int i = 0; i < M / 2; i++) {
      int a = i;
      int b = M - 1 - i;
      // Element 0 stays fixed; all others rotate by r positions.
      if (r > 0 && a > 0) a = (a - 1 + r) % (M - 1) + 1;
      if (r > 0 && b > 0) b = (b - 1 + r) % (M - 1) + 1;
      // A pair touching the dummy slot means its partner sits out this round.
      if (a >= N || b >= N) continue;
      forward.emplace_back(a, b);
      backward.emplace_back(b, a);
    }
    rounds.push_back(forward);
    rounds.push_back(backward);
  }

  // Final loopback round where all elements run in parallel against themselves.
  std::vector<std::pair<int, int>> loopback;
  for (int i = 0; i < N; i++) loopback.emplace_back(i, i);
  rounds.push_back(loopback);

  if (parallel) {
    schedule = std::move(rounds);
  } else {
    // Serialize: each pair gets its own one-entry round.
    for (auto const& round : rounds) {
      for (auto const& match : round) {
        schedule.push_back({match});
      }
    }
  }
}

// Returns a schedule for ordered 2-combination of N elements
// by pairing the list with its rotating self: round i pairs each
// element j with (j+i) mod N.
// Each emitted round contains n pairs, where 1 <= n <= N and N is
// divisible by n, and an element cannot appear more than twice in a
// round (once as first member of a pair, once as second).
// Appends the generated rounds to `schedule`.
void CombinationSchedule(std::vector<std::vector<std::pair<int, int>>>& schedule,
                         int N, int n = 0) {
  std::vector<std::vector<std::pair<int, int>>> fullSchedule;

  if (n <= 0) n = N;
  if (N <= 0 || n > N || N % n != 0) // Assuming balanced load for each round
  {
    n = 1;
    // Fall back to fully serial sub-rounds on invalid/unbalanced parameters.
    Utils::Print("[WARN] cannot create combination schedule, falling back to serial");
  }

  // Generate rounds of combination based on incrementing distance
  for (int i = 0; i < N; i++) {
    std::vector<std::pair<int, int>> round;
    for (int j = 0; j < N; j++) {
      round.push_back({j, (j+i)%N});
    }
    fullSchedule.push_back(round);
  }

  // Split each full round into sub-rounds with at most n pairs each
  for (auto const& fullRound : fullSchedule) {
    for (size_t start = 0; start < fullRound.size(); start += n) {
      std::vector<std::pair<int, int>> subRound;
      for (size_t i = start; i < start + n && i < fullRound.size(); i++) {
        subRound.push_back(fullRound[i]);
      }
      if (!subRound.empty()) {
        schedule.push_back(subRound);
      }
    }
  }
}

int GetClosestDeviceToNic(MemType memType, int nicIdx, int rank) {
return TransferBench::IsCpuMemType(memType) ?
TransferBench::GetClosestCpuNumaToNic(nicIdx, rank) :
Expand Down Expand Up @@ -203,8 +109,8 @@ int NicPeerToPeerPreset(EnvVars& ev,
std::vector<std::vector<std::pair<int, int>>> schedule;
std::vector<std::vector<std::pair<int, int>>> nicSchedule;

RoundRobinSchedule(schedule, numRanks, nodeParallel);
CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel);
Utils::RoundRobinSchedule(schedule, numRanks, nodeParallel);
Utils::CombinationSchedule(nicSchedule, numNicsPerRank, nicParLevel);

int totalTransfers = numRanks * numNicsPerRank * numRanks * numNicsPerRank;
int counter = 0;
Expand Down
167 changes: 111 additions & 56 deletions src/client/Presets/PodAllToAll.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// Reorder elements of list by stepping through with stride k, wrapping around.
// When gcd(k, n) > 1 the single cycle breaks into gcd(k, n) orbits which are
// concatenated, so every element appears exactly once in the output.
// The reordered list will be further separated into different groups.
// No-op for empty or single-element lists (the empty case previously hit
// modulo-by-zero UB) and for strides that are a multiple of the list length.
void StrideGenerate(std::vector<int>& list, int k) {
  int const n = (int)list.size();
  if (n <= 1) return;            // nothing to reorder; also guards k % 0 below
  k = ((k % n) + n) % n;         // normalize to 0..n-1
  if (k == 0) return;            // identity permutation

  int const d = std::gcd(k, n);  // number of disjoint orbits
  std::vector<int> out;
  out.reserve(n);

  // Emit each orbit in turn: orbit s visits s, s+k, s+2k, ... (mod n)
  for (int s = 0; s < d; s++) {
    for (int j = 0; j < n / d; j++) {
      out.push_back(list[(s + j * k) % n]);
    }
  }
  list = std::move(out);
}

int PodAllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName,
Expand Down Expand Up @@ -164,17 +143,25 @@ int PodAllToAllPreset(EnvVars& ev,
std::vector<MemDevice> devices(n);
std::vector<int> indices(n);
for (int k = 0; k < n; k++) indices[k] = k;
StrideGenerate(indices, stride);
Utils::StrideGenerate(indices, stride);
int idx = 0;
for (int rank : ranks) {
for (int devIdx = 0; devIdx < numGpus; devIdx++) {
devices[indices[idx++]] = {memType, devIdx, rank};
}
}

// Build transfers for every group, then run once per pod so all groups share the same
// timed iterations (traffic across groups is concurrent within RunTransfers).
std::vector<Transfer> podTransfers;
std::vector<size_t> groupTransferBase(numGroups);
std::vector<std::vector<std::vector<int>>> groupReIndexes(numGroups);

for (int group = 0; group < numGroups; group++) {
std::vector<std::vector<int>> groupReIndex(groupSize, std::vector<int>(groupSize, -1));
std::vector<Transfer> transfers;
groupTransferBase[group] = podTransfers.size();
groupReIndexes[group].assign(groupSize, std::vector<int>(groupSize, -1));
std::vector<std::vector<int>>& groupReIndex = groupReIndexes[group];

for (int i = group * groupSize; i < (group + 1) * groupSize; i++) {
for (int j = group * groupSize; j < (group + 1) * groupSize; j++) {
if (i == j) {
Expand All @@ -192,8 +179,9 @@ int PodAllToAllPreset(EnvVars& ev,
transfer.numSubExecs = numSubExecs;
int const localI = i - group * groupSize;
int const localJ = j - group * groupSize;
groupReIndex[localI][localJ] = (int)transfers.size();
transfers.push_back(transfer);
groupReIndex[localI][localJ] =
(int)(podTransfers.size() - groupTransferBase[group]);
podTransfers.push_back(transfer);
}

if (numQueuePairs > 0) {
Expand All @@ -206,34 +194,90 @@ int PodAllToAllPreset(EnvVars& ev,
(int32_t)devices[i].memIndex, (int32_t)devices[i].memRank};
transfer.exeSubIndex = devices[next].memIndex;
transfer.numSubExecs = numQueuePairs;
transfers.push_back(transfer);
podTransfers.push_back(transfer);
}
}
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
Utils::Print("%s\n", err.errMsg.c_str());
return 1;
}
if (showDetails) {
Utils::PrintResults(ev, 1, transfers, results);
}

if (Utils::RankDoesOutput()) {
for (int g = 0; g < numGroups; g++) {
int const gb = g * groupSize;
Utils::Print("A2A group %d:", g);
std::vector<int> ord(groupSize);
for (int i = 0; i < groupSize; i++) ord[i] = i;
std::sort(ord.begin(), ord.end(), [&](int a, int b) {
MemDevice const& da = devices[gb + a];
MemDevice const& db = devices[gb + b];
if (da.memRank != db.memRank) return da.memRank < db.memRank;
return da.memIndex < db.memIndex;
});
for (size_t si = 0; si < ord.size(); si++) {
MemDevice const& d = devices[gb + ord[si]];
Utils::Print("%s R%d:G%d", si ? "," : "", d.memRank, d.memIndex);
}
Utils::Print("\n");
}
}

TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, podTransfers, results)) {
for (auto const& err : results.errResults)
Utils::Print("%s\n", err.errMsg.c_str());
return 1;
}
if (showDetails) {
if (Utils::RankDoesOutput())
Utils::Print("\n--- Pod AllToAll (all %d groups concurrent) ---\n", numGroups);
Utils::PrintResults(ev, 1, podTransfers, results);
Utils::Print("\n");
}

for (int group = 0; group < numGroups; group++) {
std::vector<std::vector<int>> const& groupReIndex = groupReIndexes[group];
size_t const tfrBase = groupTransferBase[group];

// Per-group bandwidth table
std::vector<std::vector<double>> groupBw(groupSize, std::vector<double>(groupSize, -1.0));
for (int localI = 0; localI < groupSize; localI++) {
for (int localJ = 0; localJ < groupSize; localJ++) {
int const k = groupReIndex[localI][localJ];
if (k >= 0)
groupBw[localI][localJ] = results.tfrResults[k].avgBandwidthGbPerSec;
groupBw[localI][localJ] = results.tfrResults[tfrBase + k].avgBandwidthGbPerSec;
}
}
if (Utils::RankDoesOutput()) {
Utils::Print("\n--- Pod AllToAll Group %d ---\n", group);
int const groupBase = group * groupSize;

// Display order: group devices by MPI rank, then GPU index (stride only affects execution order).
std::vector<int> order(groupSize);
for (int i = 0; i < groupSize; i++) order[i] = i;
std::sort(order.begin(), order.end(), [&](int a, int b) {
MemDevice const& da = devices[groupBase + a];
MemDevice const& db = devices[groupBase + b];
if (da.memRank != db.memRank) return da.memRank < db.memRank;
return da.memIndex < db.memIndex;
});
Comment thread
AtlantaPepsi marked this conversation as resolved.
std::vector<int> colRanks;
for (int slot : order) {
int const r = devices[groupBase + slot].memRank;
if (colRanks.empty() || colRanks.back() != r) colRanks.push_back(r);
}
std::vector<std::vector<int>> localsPerCol;
localsPerCol.reserve(colRanks.size());
for (int dr : colRanks) {
std::vector<int> loc;
for (int li = 0; li < groupSize; li++) {
if (devices[groupBase + li].memRank == dr) loc.push_back(li);
}
std::sort(loc.begin(), loc.end(), [&](int a, int b) {
return devices[groupBase + a].memIndex < devices[groupBase + b].memIndex;
});
localsPerCol.push_back(std::move(loc));
}

int const numRows = 2 + groupSize;
int const numCols = 2 + groupSize;
int const numCols = 2 + (int)colRanks.size();
int const precision = 2;
Utils::TableHelper table(numRows, numCols, precision);
table.DrawRowBorder(0);
Expand All @@ -245,35 +289,46 @@ int PodAllToAllPreset(EnvVars& ev,
table.DrawColBorder(1);
table.Set(1, 1, " Mem Device ");

// Column headers
int colPrevRank = -1;
for (int j = 0; j < groupSize; j++) {
int colIdx = 2 + j;
int r = devices[groupBase + j].memRank;
if (r != colPrevRank) {
table.DrawColBorder(colIdx);
table.Set(0, colIdx, " Rank %02d ", r);
colPrevRank = r;
for (size_t c = 0; c < colRanks.size(); c++) {
int const colIdx = 2 + (int)c;
table.DrawColBorder(colIdx);
table.Set(0, colIdx, " Rank %02d ", colRanks[c]);
std::string gpuHdr;
for (int li : localsPerCol[c]) {
char t[24];
snprintf(t, sizeof(t), " GPU %02d ", devices[groupBase + li].memIndex);
gpuHdr += t;
}
table.Set(1, colIdx, " GPU %02d ", devices[groupBase + j].memIndex);
table.Set(1, colIdx, "%s", gpuHdr.c_str());
table.SetColAlignment((int)c + 2, Utils::TableHelper::ALIGN_LEFT);
}

// Row headers and data
int rowPrevRank = -1;
for (int localI = 0; localI < groupSize; localI++) {
int rowIdx = 2 + localI;
int r = devices[groupBase + localI].memRank;
for (int disp = 0; disp < groupSize; disp++) {
int const localI = order[disp];
int const rowIdx = 2 + disp;
int const r = devices[groupBase + localI].memRank;
if (r != rowPrevRank) {
table.DrawRowBorder(rowIdx);
table.Set(rowIdx, 0, " Rank %02d ", r);
rowPrevRank = r;
} else {
table.Set(rowIdx, 0, " ");
}
table.Set(rowIdx, 1, " GPU %02d ", devices[groupBase + localI].memIndex);
for (int localJ = 0; localJ < groupSize; localJ++) {
if (groupBw[localI][localJ] >= 0)
table.Set(rowIdx, 2 + localJ, " %.2f ", groupBw[localI][localJ]);
else
table.Set(rowIdx, 2 + localJ, " N/A ");
for (size_t c = 0; c < colRanks.size(); c++) {
std::string cell;
for (int localJ : localsPerCol[c]) {
char t[16];
if (groupBw[localI][localJ] >= 0)
snprintf(t, sizeof(t), " %7.2f", groupBw[localI][localJ]);
else
snprintf(t, sizeof(t), " %7s", "N/A");
cell += t;
}
int const colIdx = 2 + (int)c;
table.Set(rowIdx, colIdx, "%s", cell.c_str());
table.SetCellAlignment(rowIdx, colIdx, Utils::TableHelper::ALIGN_LEFT);
}
}
table.PrintTable(ev.outputToCsv, ev.showBorders);
Expand Down
2 changes: 1 addition & 1 deletion src/client/Presets/PodPeerToPeer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ int PodPeerToPeerPreset(EnvVars& ev,
} else {
// parallelLevel == 1: node pairs run concurrently, one device pair at a time per node pair
std::vector<std::vector<std::pair<int, int>>> nodePairSchedule;
RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1);
Utils::RoundRobinSchedule(nodePairSchedule, (int)ranks.size(), 1);

for (auto const& roundNodePairs : nodePairSchedule) {
for (int srcDev = 0; srcDev < numGpuDevices; srcDev++) {
Expand Down
Loading