Parallel PopGen Package
spectrum.cu
1 /*
2  * spectrum.cu
3  *
4  * Author: David Lawrie
5  */
6 
7 #include "../spectrum.h"
8 #include "../_internal/shared.cuh"
9 #include "../_outside_libraries/cub/device/device_scan.cuh"
10 #include "../_outside_libraries/cub/block/block_reduce.cuh"
11 #include <math_constants.h>
12 
14 namespace Spectrum_details{
15 
16 class transfer_allele_trajectories{
17 
18  struct time_sample{
19  float * mutations_freq; //allele frequency of mutations in final generation
20  bool * extinct; //extinct[pop] == true, flag if population is extinct by end of simulation
21  int * Nchrom_e; //effective number of chromosomes in each population
22  int num_mutations; //number of mutations in frequency array (columns array length for freq)
23  int sampled_generation; //number of generations in the simulation at point of sampling
24 
25  time_sample(): num_mutations(0), sampled_generation(0) { mutations_freq = NULL; extinct = NULL; Nchrom_e = NULL; }
26  time_sample(const GO_Fish::allele_trajectories & in, int sample_index): num_mutations(in.time_samples[sample_index]->num_mutations), sampled_generation(in.time_samples[sample_index]->sampled_generation){
27  //can replace with weak pointers when moving to CUDA 7+ and C++11 (or maybe not, see notes)
28  mutations_freq = in.time_samples[sample_index]->mutations_freq;
29  extinct = in.time_samples[sample_index]->extinct;
30  Nchrom_e = in.time_samples[sample_index]->Nchrom_e;
31  }
32  ~time_sample(){ mutations_freq = NULL; extinct = NULL; Nchrom_e = NULL; } //don't actually delete information, just null pointers as this just points to the real data held
33  };
34 
35  time_sample ** time_samples;
36  GO_Fish::mutID * mutations_ID; //unique ID consisting of generation, population, threadID, and device
37  unsigned int num_samples;
38  int all_mutations; //number of mutations in mutation ID array - maximal set of mutations in the simulation
39 
40  //----- initialization parameters -----
41  struct sim_constants{
42  int num_generations;
43  float num_sites;
44  int num_populations;
45 
46  sim_constants();
47  sim_constants(const GO_Fish::allele_trajectories & in){
48  num_generations = in.sim_run_constants.num_generations;
49  num_sites = in.sim_run_constants.num_sites;
50  num_populations = in.sim_run_constants.num_populations;
51  }
52  }sim_run_constants;
53  //----- end -----
54 
55  transfer_allele_trajectories(): num_samples(0), all_mutations(0) { time_samples = 0; mutations_ID = 0; }
56 
57  transfer_allele_trajectories(const GO_Fish::allele_trajectories & in): sim_run_constants(in), all_mutations(in.all_mutations) {
58  //can replace with weak pointers when moving to CUDA 7+ and C++11 (or maybe not, see notes)
59  if(!in.time_samples || in.num_samples == 0){ fprintf(stderr,"error transferring allele_trajectories to spectrum: empty allele_trajectories\n"); exit(1); }
60  num_samples = in.num_samples;
61  time_samples = new time_sample *[num_samples];
62 
63  for(int i = 0; i < num_samples; i++){ time_samples[i] = new time_sample(in,i); }
64  mutations_ID = in.mutations_ID;
65  }
66 
67  friend void Spectrum::population_frequency_histogram(SFS & mySFS, const GO_Fish::allele_trajectories & all_results, const int sample_index, const int population_index, int cuda_device);
68 
69  friend void Spectrum::site_frequency_spectrum(SFS & mySFS, const GO_Fish::allele_trajectories & all_results, const int sample_index, const int population_index, const int sample_size, int cuda_device);
70 
71  ~transfer_allele_trajectories(){ delete [] time_samples; time_samples = NULL; mutations_ID = NULL; num_samples = 0; } //don't actually delete anything, this is just a pointer class, actual data held by GO_Fish::trajectory, delete [] time_samples won't call individual destructors and even if it did, the spectrum time sample destructors don't delete anything
72 };
73 
74 __global__ void population_hist(unsigned int * out_histogram, float * in_mutation_freq, int Nchrome_e, int num_mutations, int num_sites){
75  int myID = blockIdx.x*blockDim.x + threadIdx.x;
76 
77  for(int id = myID; id < num_mutations; id+= blockDim.x*gridDim.x){
78  int index = round(Nchrome_e*in_mutation_freq[id]);
79  if(index == Nchrome_e){ index = 0; }
80  atomicAdd(&out_histogram[index],1);
81  }
82  if(myID == 0){ atomicAdd(&out_histogram[0], (num_sites - num_mutations)); }
83 }
84 
85 __global__ void uint_to_float(float * out_array, unsigned int * in_array, int N){
86  int myID = blockIdx.x*blockDim.x + threadIdx.x;
87  for(int id = myID; id < N; id+= blockDim.x*gridDim.x){ out_array[id] = in_array[id]; }
88 }
89 
90 __global__ void binom_coeff(float * binom_coeff, int half_n, int n){
91  int myIDx = blockIdx.x*blockDim.x + threadIdx.x;
92 
93  for(int idx = (myIDx+1); idx < half_n; idx+= blockDim.x*gridDim.x){ binom_coeff[idx] = logf((n+1.f-idx)/((float)idx)); }
94  if(myIDx == 0){ binom_coeff[0] = 0.0; }
95 }
96 
97 __global__ void print_Device_array_float(float * array, int num){
98 
99  //if(i%1000 == 0){ printf("\n"); }
100  for(int j = 0; j < num; j++){ printf("%d: %f\t",j,array[j]); }
101  printf("\n");
102 }
103 
104 __global__ void print_Device_array_double(double * array, int start, int end){
105 
106  //if(i%1000 == 0){ printf("\n"); }
107  for(int j = start; j < end; j++){ printf("%d: %f\t",j,array[j]); }
108  printf("\n");
109 }
110 
111 template <int BLOCK_THREADS>
112 __global__ void binom(float * d_histogram, const float * const d_mutations_freq, const float * const d_binom_coeff, const int half_n, const int num_levels, float num_sites, int num_mutations){
113  int myIDx = blockIdx.x*blockDim.x + threadIdx.x;
114  int myIDy = blockIdx.y;
115  typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduceT;
116  __shared__ typename BlockReduceT::TempStorage temp_storage;
117  float thread_data[1];
118 
119  for(int idy = myIDy; idy <= num_levels; idy+= blockDim.y*gridDim.y){
120  thread_data[0] = 0;
121  for(int idx = myIDx; idx < num_mutations; idx+= blockDim.x*gridDim.x){
122  float pf = d_mutations_freq[idx];
123  if(pf > 0 && pf < 1){ //segregating in this population
124  float qf = 1-pf;
125  float powp = idy*logf(pf);
126  float powq = (num_levels-idy)*logf(qf);
127  float coeff;
128  if(idy < half_n){ coeff = d_binom_coeff[idy]; } else{ coeff = d_binom_coeff[num_levels-idy]; }
129  thread_data[0] += expf(coeff+powp+powq);
130  }else if(idy == 0){ thread_data[0] += 1.f; } //segregating in other populations: this is the marginal SFS in one population, so they count as monomorphic only
131  }
132  float aggregate = BlockReduceT(temp_storage).Sum(thread_data);
133  if(threadIdx.x == 0){
134  if(idy == num_levels){ atomicAdd(&d_histogram[0],aggregate); }
135  else{ atomicAdd(&d_histogram[idy],aggregate); }
136  }
137  }
138  if(myIDx == 0 && myIDy == 0){ atomicAdd(&d_histogram[0],(float)(num_sites-num_mutations)); }
139 }
140 
141 } /*----- end namespace Spectrum_details ----- */
143 
147 namespace Spectrum{
148 
149 SFS::SFS(): num_populations(0), num_sites(0), num_mutations(0), sampled_generation(0) {frequency_spectrum = NULL; populations = NULL; sample_size = NULL;}
150 SFS::~SFS(){ if(frequency_spectrum){ delete [] frequency_spectrum; frequency_spectrum = NULL; } if(populations){ delete[] populations; populations = NULL; } if(sample_size){ delete[] sample_size; sample_size = NULL; }}
151 
152 //frequency histogram of mutations at a single time point in a single population
153 void population_frequency_histogram(SFS & mySFS, const GO_Fish::allele_trajectories & all_results, const int sample_index, const int population_index, int cuda_device){
154  using namespace Spectrum_details;
155  set_cuda_device(cuda_device);
156 
157  cudaStream_t stream;
158  cudaCheckErrors(cudaStreamCreate(&stream),-1,-1);
159 
160  float * d_mutations_freq;
161  float * d_histogram, * h_histogram;
162  transfer_allele_trajectories sample(all_results);
163  if(!(sample_index >= 0 && sample_index < sample.num_samples) || !(population_index >= 0 && population_index < sample.sim_run_constants.num_populations)){
164  fprintf(stderr,"site_frequency_spectrum error: requested indices out of bounds: sample %d [0 %d), population %d [0 %d)\n",sample_index,sample.num_samples,population_index,sample.sim_run_constants.num_populations); exit(1);
165  }
166 
167  int population_size = sample.time_samples[sample_index]->Nchrom_e[population_index];
168  int num_mutations = sample.time_samples[sample_index]->num_mutations;
169  float num_sites = sample.sim_run_constants.num_sites;
170 
171  cudaCheckErrorsAsync(cudaMalloc((void**)&d_mutations_freq, sample.time_samples[sample_index]->num_mutations*sizeof(float)),-1,-1);
172  cudaCheckErrorsAsync(cudaMalloc((void**)&d_histogram, population_size*sizeof(float)),-1,-1);
173  cudaCheckErrors(cudaHostRegister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*sizeof(float), cudaHostRegisterPortable),-1,-1);
174  cudaCheckErrorsAsync(cudaMemcpyAsync(d_mutations_freq, &sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*sizeof(float), cudaMemcpyHostToDevice, stream),-1,-1);
175  cudaCheckErrors(cudaHostUnregister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations]),-1,-1);
176 
177  unsigned int * d_pop_histogram;
178  cudaCheckErrorsAsync(cudaMalloc((void**)&d_pop_histogram, population_size*sizeof(unsigned int)),-1,-1);
179  cudaCheckErrorsAsync(cudaMemsetAsync(d_pop_histogram, 0, population_size*sizeof(unsigned int), stream),-1,-1);
180  population_hist<<<50,1024,0,stream>>>(d_pop_histogram, d_mutations_freq, population_size, num_mutations, num_sites);
181  cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
182 
183  int num_threads = 1024;
184  if(population_size < 1024){ num_threads = 256; if(population_size < 256){ num_threads = 128; } }
185  int num_blocks = max(population_size/num_threads,1);
186  uint_to_float<<<num_blocks,num_threads,0,stream>>>(d_histogram, d_pop_histogram, population_size);
187  cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
188  cudaCheckErrorsAsync(cudaFree(d_pop_histogram),-1,-1);
189 
190  h_histogram = new float[population_size];
191  cudaCheckErrors(cudaHostRegister(h_histogram, sizeof(float)*population_size, cudaHostRegisterPortable),-1,-1);
192  cudaCheckErrorsAsync(cudaMemcpyAsync(h_histogram, d_histogram, population_size*sizeof(float), cudaMemcpyDeviceToHost, stream),-1,-1);
193  cudaCheckErrors(cudaHostUnregister(h_histogram),-1,-1);
194 
195  if(cudaStreamQuery(stream) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(stream), -1, -1); } //wait for writes to host to finish
196 
197  mySFS.frequency_spectrum = h_histogram;
198  mySFS.num_populations = 1;
199  mySFS.sample_size = new int[1];
200  mySFS.sample_size[0] = population_size;
201  mySFS.num_sites = sample.sim_run_constants.num_sites;
202  mySFS.num_mutations = mySFS.num_sites - mySFS.frequency_spectrum[0];
203  mySFS.populations = new int[1];
204  mySFS.populations[0] = population_index;
205  mySFS.sampled_generation = sample.time_samples[sample_index]->sampled_generation;
206 
207  cudaCheckErrorsAsync(cudaFree(d_mutations_freq),-1,-1);
208  cudaCheckErrorsAsync(cudaFree(d_histogram),-1,-1);
209  cudaCheckErrorsAsync(cudaStreamDestroy(stream),-1,-1);
210 }
211 
212 //single-population SFS
213 void site_frequency_spectrum(SFS & mySFS, const GO_Fish::allele_trajectories & all_results, const int sample_index, const int population_index, const int sample_size, int cuda_device){
214  using namespace Spectrum_details;
215  set_cuda_device(cuda_device);
216 
217  cudaStream_t stream;
218  cudaCheckErrors(cudaStreamCreate(&stream),-1,-1);
219 
220  float * d_mutations_freq;
221  float * d_histogram, * h_histogram;
222  transfer_allele_trajectories sample(all_results);
223  if(!(sample_index >= 0 && sample_index < sample.num_samples) || !(population_index >= 0 && population_index < sample.sim_run_constants.num_populations)){
224  fprintf(stderr,"site_frequency_spectrum error: requested indices out of bounds: sample %d [0 %d), population %d [0 %d)\n",sample_index,sample.num_samples,population_index,sample.sim_run_constants.num_populations); exit(1);
225  }
226 
227  int num_levels = sample_size;
228  int population_size = sample.time_samples[sample_index]->Nchrom_e[population_index];
229  if((sample_size <= 0) || (sample_size >= population_size)){ fprintf(stderr,"site_frequency_spectrum error: requested sample_size out of range [1,population_size): sample_size %d [1,%d)",sample_size,population_size); exit(1); }
230 
231  if(sample_size == 0){ num_levels = population_size; }
232  int num_mutations = sample.time_samples[sample_index]->num_mutations;
233  float num_sites = sample.sim_run_constants.num_sites;
234 
235  cudaCheckErrorsAsync(cudaMalloc((void**)&d_mutations_freq, sample.time_samples[sample_index]->num_mutations*sizeof(float)),-1,-1);
236  cudaCheckErrorsAsync(cudaMalloc((void**)&d_histogram, num_levels*sizeof(float)),-1,-1);
237  cudaCheckErrors(cudaHostRegister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*sizeof(float), cudaHostRegisterPortable),-1,-1);
238  cudaCheckErrorsAsync(cudaMemcpyAsync(d_mutations_freq, &sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*sizeof(float), cudaMemcpyHostToDevice, stream),-1,-1);
239  cudaCheckErrors(cudaHostUnregister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations]),-1,-1);
240 
241  int half_n;
242  if((num_levels) % 2 == 0){ half_n = (num_levels)/2+1; }
243  else{ half_n = (num_levels+1)/2; }
244 
245  float * d_binom_partial_coeff;
246  cudaCheckErrorsAsync(cudaMalloc((void**)&d_binom_partial_coeff, half_n*sizeof(float)),-1,-1);
247  int num_threads = 1024;
248  if(half_n < 1024){ num_threads = 256; if(half_n < 256){ num_threads = 128; } }
249  int num_blocks = max(num_levels/num_threads,1);
250  binom_coeff<<<num_blocks,num_threads,0,stream>>>(d_binom_partial_coeff, half_n, num_levels);
251  cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
252 
253  float * d_binom_coeff;
254  cudaCheckErrorsAsync(cudaMalloc((void**)&d_binom_coeff, half_n*sizeof(float)),-1,-1);
255 
256  void *d_temp_storage = NULL;
257  size_t temp_storage_bytes = 0;
258  cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_binom_partial_coeff, d_binom_coeff, half_n, stream),-1,-1);
259  cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),-1,-1);
260  cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_binom_partial_coeff, d_binom_coeff, half_n, stream),-1,-1);
261  cudaCheckErrorsAsync(cudaFree(d_temp_storage),-1,-1);
262  cudaCheckErrorsAsync(cudaFree(d_binom_partial_coeff),-1,-1);
263  //print_Device_array_double<<<1,1,0,stream>>>(d_binom, 0, half_n);
264 
265  const dim3 gridsize(200,20,1);
266  const int num_threads_binom = 1024;
267  cudaCheckErrorsAsync(cudaMemsetAsync(d_histogram, 0, num_levels*sizeof(float), stream),-1,-1);
268  binom<num_threads_binom><<<gridsize,num_threads_binom,0,stream>>>(d_histogram, d_mutations_freq, d_binom_coeff, half_n, num_levels, num_sites, num_mutations);
269  cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
270  cudaCheckErrorsAsync(cudaFree(d_binom_coeff),-1,-1);
271  //slight differences in each run of the above reduction are due to different floating point error accumulations as different blocks execute in different orders each time
272  //can be ignored, might switch back to using doubles (at least for summing into d_histogram & not calculating d_binom_coeff, speed difference was negligble for the former)
273 
274  h_histogram = new float[num_levels];
275  cudaCheckErrors(cudaHostRegister(h_histogram, sizeof(float)*num_levels, cudaHostRegisterPortable),-1,-1);
276  cudaCheckErrorsAsync(cudaMemcpyAsync(h_histogram, d_histogram, num_levels*sizeof(float), cudaMemcpyDeviceToHost, stream),-1,-1);
277  cudaCheckErrors(cudaHostUnregister(h_histogram),-1,-1);
278 
279  if(cudaStreamQuery(stream) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(stream), -1, -1); } //wait for writes to host to finish
280 
281  mySFS.frequency_spectrum = h_histogram;
282  mySFS.num_populations = 1;
283  mySFS.sample_size = new int[1];
284  mySFS.sample_size[0] = num_levels;
285  mySFS.num_sites = sample.sim_run_constants.num_sites;
286  mySFS.num_mutations = mySFS.num_sites - mySFS.frequency_spectrum[0];
287  mySFS.populations = new int[1];
288  mySFS.populations[0] = population_index;
289  mySFS.sampled_generation = sample.time_samples[sample_index]->sampled_generation;
290 
291  cudaCheckErrorsAsync(cudaFree(d_mutations_freq),-1,-1);
292  cudaCheckErrorsAsync(cudaFree(d_histogram),-1,-1);
293  cudaCheckErrorsAsync(cudaStreamDestroy(stream),-1,-1);
294 }
295 
296 } /*----- end namespace SPECTRUM ----- */
float * frequency_spectrum
site frequency spectrum data structure
Definition: spectrum.h:25
control and output data structure for GO_Fish simulation
int sampled_generation
number of generations in the simulation at time of sampling
Definition: spectrum.h:31
site frequency spectrum data structure (at the moment, functions only generate SFS for a single popul...
Definition: spectrum.h:24
int * sample_size
number of samples taken for each population
Definition: spectrum.h:27
int * populations
which populations are in SFS
Definition: spectrum.h:26
Namespace for site frequency spectrum data structure and functions. (in prototype-phase) ...
Definition: spectrum.h:21
float num_sites
number of sites in SFS
Definition: spectrum.h:29
int num_populations
number of populations in simulation
float num_mutations
number of segregating mutations in SFS
Definition: spectrum.h:30
void population_frequency_histogram(SFS &mySFS, const GO_Fish::allele_trajectories &all_results, const int sample_index, const int population_index, int cuda_device=-1)
create a frequency histogram of mutations at a single time point sample_index in a single population ...
Definition: spectrum.cu:153
void site_frequency_spectrum(SFS &mySFS, const GO_Fish::allele_trajectories &all_results, const int sample_index, const int population_index, const int sample_size, int cuda_device=-1)
create a single-population SFS of size sample_size from a single time point sample_index in a single ...
Definition: spectrum.cu:213
~SFS()
default destructor
Definition: spectrum.cu:150
structure specifying the ID for a mutation in a GO_Fish simulation
float num_sites
number of sites in simulation
int num_populations
number of populations in SFS
Definition: spectrum.h:28
int num_generations
number of generations in simulation