@@ -20,9 +20,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <argtable2.h>
-#include "culsp.h"
+
 #include "periodogram.h"
+#include "culsp.h"
 #include "culsp_kernel.cu"
+#include "minmax.cu"
 
 // Wrapper macros
 
@@ -34,6 +36,15 @@
     exit (EXIT_FAILURE); \
   }}
 
+#define CUDA_ERR_CHECK() { \
+  err = cudaGetLastError (); \
+  if (err != cudaSuccess) { \
+    fprintf (stderr, "CUDA error: kernel launch failed in file '%s' at line %i: %s.\n", \
+             __FILE__, __LINE__, cudaGetErrorString (err)); \
+    exit (EXIT_FAILURE); \
+  }}
+
+
 // Forward declarations
 
 // void initialize (int, char **, char **, char **, float *, float *, int *);
@@ -108,6 +119,7 @@ main( int argc, char** argv)
   free (t);
 
   // Finish
+  return 0;
 
 }
 
@@ -193,6 +205,7 @@ eval_LS_periodogram (int N_t, int N_f, float df, float minf,
                      float *t, float *X, float *P)
 {
 
+
   // Allocate device memory and copy data over
 
   float *d_t;
@@ -217,7 +230,7 @@ eval_LS_periodogram (int N_t, int N_f, float df, float minf,
 
   // printf("Launching kernel...\n");
 
-  culsp_kernel<<<grid_dim, block_dim>>> (d_t, d_X, d_P, df, N_t, minf);
+  culsp_kernel<<<grid_dim, block_dim>>> (d_t, d_X, d_P, df, N_t, N_f, minf);
 
   cudaError_t err = cudaGetLastError ();
   if (err != cudaSuccess) {
@@ -241,3 +254,108 @@ eval_LS_periodogram (int N_t, int N_f, float df, float minf,
   // Finish
 
 }
+
+void
+bootstrap_LS_periodogram (int N_t, int N_f, float df, float minf,
+                          float *t, float *X, float *max_heights, int N_bootstrap, int use_gpu_to_get_max){
+
+
+  // Allocate device memory and copy data over
+
+  float *d_t, *d_X, *d_P;
+  float *P, *gmax;
+  int i, gd, gdm, gdm0;
+  float val;
+
+  curandState *state;
+  cudaError_t err;
+
+  CUDA_CALL (cudaMalloc ((void **) &d_t, N_t*sizeof (float)));
+  CUDA_CALL (cudaMalloc ((void **) &d_X, N_t*sizeof (float)));
+  CUDA_CALL (cudaMalloc ((void **) &d_P, N_f*sizeof (float)));
+
+  CUDA_CALL (cudaMemcpy (d_t, t, N_t*sizeof (float), cudaMemcpyHostToDevice));
+  CUDA_CALL (cudaMemcpy (d_X, X, N_t*sizeof (float), cudaMemcpyHostToDevice));
+
+  // Compute N_bootstrap bootstrapped periodograms, recording the peak
+  // height of each. This could be made faster by finding the maximum
+  // inside the bootstrap kernel itself, but that is considerably more
+  // complicated and the expected gain is small: a light curve is ~0.4 MB,
+  // so the wasted transfer time is roughly
+  //   2 * (0.4/1000 GB) / (~15 GB/s, PCIe3 x16) ~ 8E-4 seconds per iteration
+  // -- maybe significant, maybe not; worth timing.
+
+
+  gd = N_f/BLOCK_SIZE;
+  if (gd * BLOCK_SIZE < N_f) gd += 1; // ensure we have enough blocks
+
+  dim3 grid_dim (gd, 1, 1);
+  dim3 block_dim (BLOCK_SIZE, 1, 1);
+
+  // set up the random number generator (one state per thread)
+  CUDA_CALL (cudaMalloc ((void **) &state, gd*BLOCK_SIZE * sizeof (curandState)));
+  setup_curand_kernel<<<grid_dim, block_dim>>> (state, time (NULL));
+
+  if (use_gpu_to_get_max){
+    // allocate memory for the per-block maximums array
+    CUDA_CALL (cudaMalloc ((void **) &gmax, gd * sizeof (float)));
+
+  } else {
+
+    // printf("USING CPU TO FIND MAX(P_LS)\n");
+    P = (float *) malloc (N_f * sizeof (float));
+  }
+
+  for (i=0; i<N_bootstrap; i++){
+
+    bootstrap_kernel<<<grid_dim, block_dim>>> (d_t, d_X, d_P, df, N_t, N_f, minf, state);
+    // CUDA_ERR_CHECK();
+
+    if (use_gpu_to_get_max){
+      // first pass: reduce the periodogram to one maximum per block
+      max_reduce<<<grid_dim, block_dim, BLOCK_SIZE * sizeof (float)>>> (d_P, gmax, N_f);
+
+      // keep reducing the per-block maxima until a single block suffices
+      gdm = gd;
+      while (gdm > 1){
+
+        gdm0 = gdm;
+        gdm /= BLOCK_SIZE;
+        if (gdm * BLOCK_SIZE < gdm0) gdm += 1;
+
+        dim3 grid_dim_max (gdm, 1, 1);
+
+        max_reduce<<<grid_dim_max, block_dim, BLOCK_SIZE*sizeof (float)>>> (gmax, gmax, gdm0);
+
+      }
+
+      // copy max(P) to the host
+      CUDA_CALL (cudaMemcpy (&val, gmax, sizeof (float), cudaMemcpyDeviceToHost));
+
+    } else {
+
+      CUDA_CALL (cudaMemcpy (P, d_P, N_f*sizeof (float), cudaMemcpyDeviceToHost));
+      // printf("CPUMAX");
+      val = cpu_maxf (P, N_f);
+    }
+
+    max_heights[i] = val;
+  }
+
+  // CUDA_ERR_CHECK();
+
+  CUDA_CALL (cudaThreadSynchronize ());
+
+  CUDA_CALL (cudaFree (d_P));
+  CUDA_CALL (cudaFree (d_X));
+  CUDA_CALL (cudaFree (d_t));
+  CUDA_CALL (cudaFree (state)); // state is allocated on both paths
+  if (use_gpu_to_get_max) {
+    CUDA_CALL (cudaFree (gmax));
+  } else {
+    free (P);                   // host buffer used on the CPU max path
+  }
+
+  // Finish
+
+}
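
The new bootstrap path calls three helpers that are not shown in this diff: max_reduce and cpu_maxf (from minmax.cu) and setup_curand_kernel (from culsp_kernel.cu). For reference, here is a minimal sketch of what a max_reduce matching the launches above -- max_reduce<<<grid, block, BLOCK_SIZE*sizeof(float)>>>(in, out, n), one maximum written per block -- could look like. This is an assumed, standard shared-memory tree reduction, not the committed implementation:

#include <float.h>

// Hypothetical sketch: reduce n values in `in` to one maximum per block,
// written to out[blockIdx.x]. Assumes blockDim.x is a power of two.
__global__ void
max_reduce (float *in, float *out, int n)
{
  // dynamically-sized shared buffer, BLOCK_SIZE floats per the launch above
  extern __shared__ float sdata[];

  int tid = threadIdx.x;
  int i = blockIdx.x*blockDim.x + threadIdx.x;

  // load one element per thread; pad out-of-range threads with -FLT_MAX
  sdata[tid] = (i < n) ? in[i] : -FLT_MAX;
  __syncthreads ();

  // pairwise tree reduction in shared memory
  for (int s = blockDim.x/2; s > 0; s >>= 1) {
    if (tid < s) sdata[tid] = fmaxf (sdata[tid], sdata[tid + s]);
    __syncthreads ();
  }

  // thread 0 writes this block's maximum
  if (tid == 0) out[blockIdx.x] = sdata[0];
}

Each launch shrinks the input from n values to one per block, which is why the host-side while loop above keeps re-launching until gdm reaches 1.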
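Along the same lines, setup_curand_kernel and cpu_maxf are presumably a per-thread curand_init wrapper and a host-side linear scan; sketches under those assumptions (the committed versions live in the files included above):

#include <curand_kernel.h>

// Assumed form: seed one curandState per thread, matching the
// gd*BLOCK_SIZE states allocated in bootstrap_LS_periodogram.
__global__ void
setup_curand_kernel (curandState *state, unsigned long long seed)
{
  int id = blockIdx.x*blockDim.x + threadIdx.x;

  // same seed, distinct sequence number per thread
  curand_init (seed, id, 0, &state[id]);
}

// Assumed form: linear scan for the maximum, used on the CPU max path.
float
cpu_maxf (float *x, int n)
{
  float m = x[0];
  for (int i = 1; i < n; i++)
    if (x[i] > m) m = x[i];
  return m;
}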