@@ -20,9 +20,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <argtable2.h>
-#include "culsp.h"
+
 #include "periodogram.h"
+#include "culsp.h"
 #include "culsp_kernel.cu"
+#include "minmax.cu"
 
 // Wrapper macros
 
@@ -34,6 +36,15 @@
     exit (EXIT_FAILURE); \
   }}
 
+#define CUDA_ERR_CHECK() { \
+  err = cudaGetLastError (); \
+  if (err != cudaSuccess) { \
+    fprintf (stderr, "CUDA error: kernel launch failed in file '%s' at line %i: %s.\n", \
+             __FILE__, __LINE__, cudaGetErrorString (err)); \
+    exit (EXIT_FAILURE); \
+  }}
+
+
 // Forward declarations
 
 // void initialize (int, char **, char **, char **, float *, float *, int *);
@@ -108,6 +119,7 @@ main( int argc, char** argv)
   free (t);
 
   // Finish
+  return 0;
 
 }
 
@@ -193,6 +205,7 @@ eval_LS_periodogram (int N_t, int N_f, float df, float minf,
                      float *t, float *X, float *P)
 {
 
+
   // Allocate device memory and copy data over
 
   float *d_t;
@@ -217,7 +230,7 @@ eval_LS_periodogram (int N_t, int N_f, float df, float minf,
 
   // printf("Launching kernel...\n");
 
-  culsp_kernel<<<grid_dim, block_dim>>> (d_t, d_X, d_P, df, N_t, minf);
+  culsp_kernel<<<grid_dim, block_dim>>> (d_t, d_X, d_P, df, N_t, N_f, minf);
 
   cudaError_t err = cudaGetLastError ();
   if (err != cudaSuccess) {
@@ -241,3 +254,108 @@ eval_LS_periodogram (int N_t, int N_f, float df, float minf,
   // Finish
 
 }
+
+void
+bootstrap_LS_periodogram (int N_t, int N_f, float df, float minf,
+                          float *t, float *X, float *max_heights, int N_bootstrap, int use_gpu_to_get_max){
+
+
+  // Allocate device memory and copy data over
+
+  float *d_t, *d_X, *d_P;
+  float *P, *gmax;
+  int i, gd, gdm, gdm0;
+  float val;
+
+  curandState *state;
+  cudaError_t err;
+
+  CUDA_CALL (cudaMalloc ((void **) &d_t, N_t*sizeof (float)));
+  CUDA_CALL (cudaMalloc ((void **) &d_X, N_t*sizeof (float)));
+  CUDA_CALL (cudaMalloc ((void **) &d_P, N_f*sizeof (float)));
+
+  CUDA_CALL (cudaMemcpy (d_t, t, N_t*sizeof (float), cudaMemcpyHostToDevice));
+  CUDA_CALL (cudaMemcpy (d_X, X, N_t*sizeof (float), cudaMemcpyHostToDevice));
+
+  // Compute N_bootstrap bootstrapped periodograms, recording the peak
+  // height of each. This could be made faster by finding the maximum
+  // inside the bootstrap kernel itself, but that is considerably more
+  // complicated and the expected gain is small: a light curve is ~0.4 MB,
+  // so the wasted transfer time is roughly
+  //   2 * (0.4/1000 GB) / (~15 GB/s, PCIe3 x16) ~ 8E-4 seconds per iteration
+  // -- maybe significant, maybe not; worth timing.
+
+
+  gd = N_f/BLOCK_SIZE;
+  if (gd * BLOCK_SIZE < N_f) gd += 1; // ensure we have enough blocks
+
+  dim3 grid_dim (gd, 1, 1);
+  dim3 block_dim (BLOCK_SIZE, 1, 1);
+
+  // set up the random number generator (one state per thread)
+  CUDA_CALL (cudaMalloc ((void **) &state, gd*BLOCK_SIZE * sizeof (curandState)));
+  setup_curand_kernel<<<grid_dim, block_dim>>> (state, time (NULL));
+
+  if (use_gpu_to_get_max){
+    // allocate memory for the per-block maximums array
+    CUDA_CALL (cudaMalloc ((void **) &gmax, gd * sizeof (float)));
+
+  } else {
+
+    // printf("USING CPU TO FIND MAX(P_LS)\n");
+    P = (float *) malloc (N_f * sizeof (float));
+  }
+
+  for (i=0; i<N_bootstrap; i++){
+
+    bootstrap_kernel<<<grid_dim, block_dim>>> (d_t, d_X, d_P, df, N_t, N_f, minf, state);
+    // CUDA_ERR_CHECK();
+
+    if (use_gpu_to_get_max){
+      // first pass: reduce the periodogram to one maximum per block
+      max_reduce<<<grid_dim, block_dim, BLOCK_SIZE * sizeof (float)>>> (d_P, gmax, N_f);
+
+      // keep reducing the per-block maxima until a single block suffices
+      gdm = gd;
+      while (gdm > 1){
+
+        gdm0 = gdm;
+        gdm /= BLOCK_SIZE;
+        if (gdm * BLOCK_SIZE < gdm0) gdm += 1;
+
+        dim3 grid_dim_max (gdm, 1, 1);
+
+        max_reduce<<<grid_dim_max, block_dim, BLOCK_SIZE*sizeof (float)>>> (gmax, gmax, gdm0);
+
+      }
+
+      // copy max(P) to the host
+      CUDA_CALL (cudaMemcpy (&val, gmax, sizeof (float), cudaMemcpyDeviceToHost));
+
+    } else {
+
+      CUDA_CALL (cudaMemcpy (P, d_P, N_f*sizeof (float), cudaMemcpyDeviceToHost));
+      // printf("CPUMAX");
+      val = cpu_maxf (P, N_f);
+    }
+
+    max_heights[i] = val;
+  }
+
+  // CUDA_ERR_CHECK();
+
+  CUDA_CALL (cudaThreadSynchronize ());
+
+  CUDA_CALL (cudaFree (d_P));
+  CUDA_CALL (cudaFree (d_X));
+  CUDA_CALL (cudaFree (d_t));
+  CUDA_CALL (cudaFree (state)); // state is allocated on both paths
+  if (use_gpu_to_get_max) {
+    CUDA_CALL (cudaFree (gmax));
+  } else {
+    free (P);                   // host buffer used on the CPU max path
+  }
+
+  // Finish
+
+}
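
The new bootstrap path calls three helpers that are not shown in this diff: max_reduce and cpu_maxf (from minmax.cu) and setup_curand_kernel (from culsp_kernel.cu). For reference, here is a minimal sketch of what a max_reduce matching the launches above -- max_reduce<<<grid, block, BLOCK_SIZE*sizeof(float)>>>(in, out, n), one maximum written per block -- could look like. This is an assumed, standard shared-memory tree reduction, not the committed implementation:

#include <float.h>

// Hypothetical sketch: reduce n values in `in` to one maximum per block,
// written to out[blockIdx.x]. Assumes blockDim.x is a power of two.
__global__ void
max_reduce (float *in, float *out, int n)
{
  // dynamically-sized shared buffer, BLOCK_SIZE floats per the launch above
  extern __shared__ float sdata[];

  int tid = threadIdx.x;
  int i = blockIdx.x*blockDim.x + threadIdx.x;

  // load one element per thread; pad out-of-range threads with -FLT_MAX
  sdata[tid] = (i < n) ? in[i] : -FLT_MAX;
  __syncthreads ();

  // pairwise tree reduction in shared memory
  for (int s = blockDim.x/2; s > 0; s >>= 1) {
    if (tid < s) sdata[tid] = fmaxf (sdata[tid], sdata[tid + s]);
    __syncthreads ();
  }

  // thread 0 writes this block's maximum
  if (tid == 0) out[blockIdx.x] = sdata[0];
}

Each launch shrinks the input from n values to one per block, which is why the host-side while loop above keeps re-launching until gdm reaches 1.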
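Along the same lines, setup_curand_kernel and cpu_maxf are presumably a per-thread curand_init wrapper and a host-side linear scan; sketches under those assumptions (the committed versions live in the files included above):

#include <curand_kernel.h>

// Assumed form: seed one curandState per thread, matching the
// gd*BLOCK_SIZE states allocated in bootstrap_LS_periodogram.
__global__ void
setup_curand_kernel (curandState *state, unsigned long long seed)
{
  int id = blockIdx.x*blockDim.x + threadIdx.x;

  // same seed, distinct sequence number per thread
  curand_init (seed, id, 0, &state[id]);
}

// Assumed form: linear scan for the maximum, used on the CPU max path.
float
cpu_maxf (float *x, int n)
{
  float m = x[0];
  for (int i = 1; i < n; i++)
    if (x[i] > m) m = x[i];
  return m;
}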