-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtest.cpp
More file actions
149 lines (129 loc) · 5.32 KB
/
test.cpp
File metadata and controls
149 lines (129 loc) · 5.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <nccl.h>
#ifdef __GNUC__
// For __rdtsc intrinsic on GCC/Clang (x86)
#include <x86intrin.h>
#endif
// ------------------------------------------------------------------
// From the prompt: calibrate() and gettime() for measuring time
// ------------------------------------------------------------------
static double freq = -1;
__attribute__((visibility("hidden"))) void calibrate() {
struct timeval tv;
gettimeofday(&tv, NULL);
uint64_t timeCycles = __rdtsc();
double time = - tv.tv_sec * 1e6 - tv.tv_usec;
uint64_t total = 0ULL;
// Dummy loop to let some time pass
for (int i = 0; i < 10000; i++) {
total += __rdtsc();
}
(void)total; // avoid unused variable warning
gettimeofday(&tv, NULL);
timeCycles = __rdtsc() - timeCycles; // Compute elapsed CPU cycles
time += tv.tv_sec * 1e6 + tv.tv_usec; // Compute elapsed real-world time (microseconds)
freq = timeCycles / time; // cycles / microseconds => cycles per microsecond
}
__attribute__((visibility("hidden"))) double gettime() {
// Return current timestamp in microseconds based on calibrated freq
return __rdtsc() / freq;
}
// ------------------------------------------------------------------
// Helper macro for checking NCCL errors
// ------------------------------------------------------------------
#define NCCL_CALL(cmd) do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
fprintf(stderr, "NCCL error %s:%d '%s'\n", \
__FILE__, __LINE__, ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// ------------------------------------------------------------------
// CUDA error checking
// ------------------------------------------------------------------
#define CUDA_CALL(cmd) do { \
cudaError_t e = cmd; \
if (e != cudaSuccess) { \
fprintf(stderr, "CUDA error %s:%d '%s'\n", \
__FILE__, __LINE__, cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// ------------------------------------------------------------------
// Example main
// ------------------------------------------------------------------
int main(int argc, char* argv[]) {
// We will assume at least 2 GPUs are available
// For demonstration, we'll use exactly 2.
// (You can modify 'numGPUs' and 'devs' accordingly for more GPUs.)
int numGPUs = 2;
int devs[2] = {0, 1}; // Use GPU 0 and GPU 1
// Amount of data per GPU (e.g., 256 floats)
size_t N = 256;
// Initialize device buffers and streams
float* d_sendBuff[2];
float* d_recvBuff[2];
cudaStream_t streams[2];
for (int i = 0; i < numGPUs; ++i) {
CUDA_CALL(cudaSetDevice(devs[i]));
CUDA_CALL(cudaMalloc(&d_sendBuff[i], N * sizeof(float)));
CUDA_CALL(cudaMalloc(&d_recvBuff[i], N * sizeof(float)));
CUDA_CALL(cudaStreamCreate(&streams[i]));
// (Optional) Initialize send buffers to some values
// For demonstration, we won't do a separate host/device copy.
// If needed, you can create a host buffer and cudaMemcpyAsync here.
CUDA_CALL(cudaMemset(d_sendBuff[i], i + 1, N * sizeof(float)));
CUDA_CALL(cudaMemset(d_recvBuff[i], 0, N * sizeof(float)));
}
// Create NCCL communicators
ncclComm_t comms[2];
NCCL_CALL(ncclCommInitAll(comms, numGPUs, devs));
// Calibrate TSC frequency before timing
calibrate();
// Measure start time
double start = gettime();
// Do a small for-loop with multiple AllReduce calls
int numIterations = 10;
for (int iter = 0; iter < numIterations; ++iter) {
// Launch an AllReduce on each device
for (int i = 0; i < numGPUs; ++i) {
CUDA_CALL(cudaSetDevice(devs[i]));
// Each GPU calls ncclAllReduce
NCCL_CALL(ncclAllReduce(
(const void*)d_sendBuff[i],
(void*)d_recvBuff[i],
N,
ncclFloat,
ncclSum,
comms[i],
streams[i]));
}
// Synchronize all streams to ensure the operation completes
// before next iteration (or measure outside the loop if you prefer).
for (int i = 0; i < numGPUs; ++i) {
CUDA_CALL(cudaSetDevice(devs[i]));
CUDA_CALL(cudaStreamSynchronize(streams[i]));
}
}
// Measure end time
double end = gettime();
double elapsed = end - start; // in microseconds
// Print timing result
printf("Total time for %d iterations of ncclAllReduce: %.3f microseconds\n",
numIterations, elapsed);
printf("Average per iteration: %.3f microseconds\n",
elapsed / numIterations);
// Cleanup
for (int i = 0; i < numGPUs; ++i) {
NCCL_CALL(ncclCommDestroy(comms[i]));
CUDA_CALL(cudaFree(d_sendBuff[i]));
CUDA_CALL(cudaFree(d_recvBuff[i]));
CUDA_CALL(cudaStreamDestroy(streams[i]));
}
return 0;
}