Parallel PopGen Package
go_fish_impl.cuh
1 /*
2  * go_fish_impl.cuh
3  *
4  * Author: David Lawrie
5  * implementation of template and inline functions for GO Fish simulation
6  */
7 
8 #ifndef GO_FISH_IMPL_CUH_
9 #define GO_FISH_IMPL_CUH_
10 
11 #include "../_outside_libraries/cub/device/device_scan.cuh"
12 #include "../_internal/shared.cuh"
13 #include "../go_fish_data_struct.h"
14 
16 namespace go_fish_details{
17 
18 __device__ __forceinline__ float4 operator-(float a, float4 b){ return make_float4((a-b.x), (a-b.y), (a-b.z), (a-b.w)); }
19 
20 __device__ __forceinline__ float mse(float i, int N, float F, float h, float s){
21  return exp(2*N*s*i*((2*h+(1-2*h)*i)*(1-F) + 2*F)/(1+F)); //works for either haploid or diploid, N should be number of individuals, for haploid, F = 1
22 }
23 
24 template <typename Functor_selection>
25 struct mse_integrand{
26  Functor_selection sel_coeff;
27  int N, pop, gen;
28  float F, h;
29 
30  mse_integrand(): N(0), h(0), F(0), pop(0), gen(0) {}
31  mse_integrand(Functor_selection xsel_coeff, int xN, float xF, float xh, int xpop, int xgen = 0): sel_coeff(xsel_coeff), N(xN), F(xF), h(xh), pop(xpop), gen(xgen) { }
32 
33  __device__ __forceinline__ float operator()(float i) const{
34  float s = max(sel_coeff(pop, gen, i),-1.f);
35  return mse(i, N, F, h, -1*s); //exponent term in integrand is negative inverse
36  }
37 };
38 
39 template<typename Functor_function>
40 struct trapezoidal_upper{
41  Functor_function fun;
42  trapezoidal_upper() { }
43  trapezoidal_upper(Functor_function xfun): fun(xfun) { }
44  __device__ __forceinline__ float operator()(float a, float step_size) const{ return step_size*(fun(a)+fun(a-step_size))/2; } //upper integral
45 };
46 
47 //generates an array of areas from 1 to 0 of frequencies at every step size
48 template <typename Functor_Integrator>
49 __global__ void calculate_area(float * d_freq, const int num_freq, const float step_size, Functor_Integrator trapezoidal){
50  int myID = blockIdx.x*blockDim.x + threadIdx.x;
51 
52  for(int id = myID; id < num_freq; id += blockDim.x*gridDim.x){ d_freq[id] = trapezoidal((1.0 - id*step_size), step_size); }
53 }
54 
55 __global__ void reverse_array(float * array, const int N);
56 
57 //determines number of mutations at each frequency in the initial population, sets it equal to mutation-selection balance
58 template <typename Functor_selection>
59 __global__ void initialize_mse_frequency_array(int * freq_index, float * mse_integral, const int offset, const float mu, const int Nind, const int Nchrom, const float L, const Functor_selection sel_coeff, const float F, const float h, const int2 seed, const int population){
60  int myID = blockIdx.x*blockDim.x + threadIdx.x;
61  float mse_total = mse_integral[0]; //integral from frequency 0 to 1
62  for(int id = myID; id < (Nchrom-1); id += blockDim.x*gridDim.x){ //exclusive, number of freq in pop is chromosome population size N-1
63  float i = (id+1.f)/Nchrom;
64  float j = ((Nchrom - id)+1.f)/Nchrom; //ensures that when i gets close to be rounded to 1, Ni doesn't become 0 when it isn't actually 0 unlike simply taking 1-i
65  float s = sel_coeff(population, 0, i);
66  float lambda;
67  if(s == 0){ lambda = 2*mu*L/i; }
68  else{ lambda = 2*mu*L*(mse(i, Nind, F, h, s)*mse_integral[id])/(mse_total*i*j); }
69  freq_index[offset+id] = max(RNG::ApproxRandPois1(lambda, lambda, mu, L*Nchrom, seed, 0, id, population),0);//mutations are poisson distributed in each frequency class //for round(lambda);//rounding can significantly under count for large N: //
70  }
71 }
72 
73 //fills in mutation array using the freq and scan indices
74 //y threads correspond to freq_index/scan_index indices, use grid-stride loops
75 //x threads correspond to mutation array indices, use grid-stride loops
76 //using scan number to define start of array, freq_index to define num_new_mutations_index (if 0 simply ignore) and myIDx used to calculate allele_count
77 __global__ void initialize_mse_mutation_array(float * mutations_freq, const int * freq_index, const int * scan_index, const int offset, const int Nchrom, const int population, const int num_populations, const int array_Length);
78 
79 __global__ void mse_set_mutID(int4 * mutations_ID, const float * const mutations_freq, const int mutations_Index, const int num_populations, const int array_Length, const bool preserve_mutations);
80 
81 /*__global__ void print_Device_array_uint(unsigned int * array, int num);
82 __global__ void sum_Device_array_bit(unsigned int * array, int num);
83 __global__ void sum_Device_array_uint(unsigned int * array, int num);
84 __global__ void sum_Device_array_float(float * array, int start, int end);
85 __global__ void compareDevicearray(int * array1, int * array2, int array_length);
86 __global__ void copyDevicearray(int * array1, int * array2, int array_length);
87 __global__ void compareDevicearray(float * array1, float * array2, int array_length);
88 __global__ void copyDevicearray(float * array1, float * array2, int array_length);
89 __global__ void print_Device_array_float(float * array, int num);*/
90 
91 //calculates new frequencies for every mutation in the population
92 //seed for random number generator philox's key space, id, generation for its counter space in the pseudorandom sequence
93 template <typename Functor_migration, typename Functor_selection>
94 __global__ void migration_selection_drift(float * mutations_freq, float * const prev_freq, int4 * const mutations_ID, const int mutations_Index, const int array_Length, const int N, const Functor_migration mig_prop, const Functor_selection sel_coeff, const float F, const float h, const int2 seed, const int population, const int num_populations, const int generation){
95  int myID = blockIdx.x*blockDim.x + threadIdx.x;
96 
97  for(int id = myID; id < mutations_Index/4; id+= blockDim.x*gridDim.x){
98  float4 i_mig = make_float4(0);
99  for(int pop = 0; pop < num_populations; pop++){
100  float4 i = reinterpret_cast<float4*>(prev_freq)[pop*array_Length/4+id]; //allele frequency in previous population //make sure array length is divisible by 4 (preferably divisible by 32 or warp_size)!!!!!!
101  i_mig += mig_prop(pop,population,generation)*i; //if population is size 0 or extinct < this does not protect user if they have an incorrect migration function (shouldn't happen anyway, error msg will be spit out in check_sim_paramters)
102  }
103 
104  float4 s = make_float4(max(sel_coeff(population,generation,i_mig.x),-1.f),max(sel_coeff(population,generation,i_mig.y),-1.f),max(sel_coeff(population,generation,i_mig.z),-1.f),max(sel_coeff(population,generation,i_mig.w),-1.f));
105  float4 i_mig_sel = (s*i_mig*i_mig+i_mig+(F+h-h*F)*s*i_mig*(1-i_mig))/(i_mig*i_mig*s+(F+2*h-2*h*F)*s*i_mig*(1-i_mig)+1);
106  float4 mean = i_mig_sel*N; //expected allele count in new generation
107  int4 j_mig_sel_drift = clamp(RNG::ApproxRandBinom4(mean,(1.0-i_mig_sel)*mean,i_mig_sel,N,seed,(id + 2),generation,population), 0, N);
108  reinterpret_cast<float4*>(mutations_freq)[population*array_Length/4+id] = make_float4(j_mig_sel_drift)/N; //final allele freq in new generation //make sure array length is divisible by 4 (preferably 32/warp_size)!!!!!!
109  }
110  int id = myID + mutations_Index/4 * 4; //only works if minimum of 3 threads are launched
111  if(id < mutations_Index){
112  float i_mig = 0;
113  for(int pop = 0; pop < num_populations; pop++){
114  float i = prev_freq[pop*array_Length+id]; //allele frequency in previous population
115  i_mig += mig_prop(pop,population,generation)*i;
116  }
117 
118  float s = max(sel_coeff(population,generation,i_mig),-1.f);
119  float i_mig_sel = (s*i_mig*i_mig+i_mig+(F+h-h*F)*s*i_mig*(1-i_mig))/(i_mig*i_mig*s+(F+2*h-2*h*F)*s*i_mig*(1-i_mig)+1);
120  float mean = i_mig_sel*N; //expected allele count in new generation
121  int j_mig_sel_drift = clamp(RNG::ApproxRandBinom1(mean,(1.0-i_mig_sel)*mean,i_mig_sel,N,seed,(id + 2),generation,population), 0, N);
122  mutations_freq[population*array_Length+id] = float(j_mig_sel_drift)/N; //final allele freq in new generation
123  }
124 }
125 
126 __global__ void add_new_mutations(float * mutations_freq, int4 * mutations_ID, const int prev_mutations_Index, const int new_mutations_Index, const int array_Length, float freq, const int population, const int num_populations, const int generation);
127 
128 __device__ __forceinline__ bool boundary_0(float freq){
129  return (freq <= 0.f);
130 }
131 
132 __device__ __forceinline__ bool boundary_1(float freq){
133  return (freq >= 1.f);
134 }
135 
136 //tests indicate accumulating mutations in non-migrating populations is not much of a problem
137 template <typename Functor_demography>
138 __global__ void flag_segregating_mutations(unsigned int * flag, unsigned int * counter, const Functor_demography demography, const float * const mutations_freq, const int4 * const mutations_ID, const int num_populations, const int padded_mut_Index, const int mutations_Index, const int array_Length, const int generation, const int warp_size){
139 //adapted from https://www.csuohio.edu/engineering/sites/csuohio.edu.engineering/files/Research_Day_2015_EECS_Poster_14.pdf
140  int myID = blockIdx.x*blockDim.x + threadIdx.x;
141  for(int id = myID; id < (padded_mut_Index >> 5); id+= blockDim.x*gridDim.x){
142  int lnID = threadIdx.x % warp_size;
143  int warpID = id >> 5;
144 
145  unsigned int mask;
146  unsigned int cnt=0;
147 
148  for(int j = 0; j < 32; j++){
149  bool zero = 1;
150  bool one = 1;
151  bool preserve = 0;
152  int index = (warpID<<10)+(j<<5)+lnID;
153  if(index >= mutations_Index){ zero *= 1; one = 0; }
154  else{
155  for(int pop = 0; pop < num_populations; pop++){
156  if(demography(pop,generation) > 0){ //not protected if population goes extinct but demography function becomes non-zero again (shouldn't happen anyway, error msg will be spit out in check_sim_paramters)
157  float i = mutations_freq[pop*array_Length+index];
158  zero = zero & boundary_0(i);
159  one = one & boundary_1(i);
160  //must be lost in all or gained in all populations to be lost or fixed
161  }
162  }
163 
164  preserve = (mutations_ID[index].z < 0);
165  }
166  mask = __ballot((!(zero|one) || preserve)); //1 if allele is segregating in any population, 0 otherwise
167 
168  if(lnID == 0) {
169  flag[(warpID<<5)+j] = mask;
170  cnt += __popc(mask);
171  }
172  }
173 
174  if(lnID == 0) counter[warpID] = cnt; //store sum
175  }
176 }
177 
178 __global__ void scatter_arrays(float * new_mutations_freq, int4 * new_mutations_ID, const float * const mutations_freq, const int4 * const mutations_ID, const unsigned int * const flag, const unsigned int * const scan_Index, const int padded_mut_Index, const int new_array_Length, const int old_array_Length, const bool preserve_mutations, const int warp_size);
179 
180 __global__ void preserve_prev_run_mutations(int4 * mutations_ID, const int mutations_Index);
181 
182 //for internal simulation function passing
183 struct sim_struct{
184  //device arrays
185  float * d_mutations_freq; //allele frequency of current mutations
186  float * d_prev_freq; // meant for storing frequency values so changes in previous populations' frequencies don't affect later populations' migration
187  int4 * d_mutations_ID; //generation in which mutation appeared, population in which mutation first arose, ID that generated mutation (negative value indicates preserved state), reserved for later use
188 
189  int h_num_populations; //number of populations in the simulation (# rows for freq)
190  int h_array_Length; //full length of the mutation array, total number of mutations across all populations (# columns for freq)
191  int h_mutations_Index; //number of mutations in the population (last mutation is at h_mutations_Index-1)
192  float h_num_sites; //number of sites in the simulation
193  int * h_new_mutation_Indices; //indices of new mutations, current age/freq index of mutations is at position 0, index for mutations in population 0 to be added to array is at position 1, etc ...
194  bool * h_extinct; //boolean if population has gone extinct
195  int warp_size; //device warp size, determines the amount of extra padding on array to allow for memory coalscence
196 
197  sim_struct(): h_num_populations(0), h_array_Length(0), h_mutations_Index(0), h_num_sites(0), warp_size(0) { d_mutations_freq = NULL; d_prev_freq = NULL; d_mutations_ID = NULL; h_new_mutation_Indices = NULL; h_extinct = NULL;}
198  ~sim_struct(){ cudaCheckErrorsAsync(cudaFree(d_mutations_freq),-1,-1); cudaCheckErrorsAsync(cudaFree(d_prev_freq),-1,-1); cudaCheckErrorsAsync(cudaFree(d_mutations_ID),-1,-1); if(h_new_mutation_Indices) { delete [] h_new_mutation_Indices; } if(h_extinct){ delete [] h_extinct; } }
199 };
200 
201 template <typename Functor_selection>
202 __host__ void integrate_mse(float * d_mse_integral, const int N_ind, const int Nchrom_e, const Functor_selection sel_coeff, const float F, const float h, int pop, cudaStream_t pop_stream){
203  float * d_freq;
204 
205  cudaCheckErrorsAsync(cudaMalloc((void**)&d_freq, Nchrom_e*sizeof(float)),0,pop);
206 
207  mse_integrand<Functor_selection> mse_fun(sel_coeff, N_ind, F, h, pop);
208  trapezoidal_upper< mse_integrand<Functor_selection> > trap(mse_fun);
209 
210  calculate_area<<<10,1024,0,pop_stream>>>(d_freq, Nchrom_e, (float)1.0/(Nchrom_e), trap); //setup array frequency values to integrate over (upper integral from 1 to 0)
211  cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
212 
213  void * d_temp_storage = NULL;
214  size_t temp_storage_bytes = 0;
215  cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_freq, d_mse_integral, Nchrom_e, pop_stream),0,pop);
216  cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),0,pop);
217  cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_freq, d_mse_integral, Nchrom_e, pop_stream),0,pop);
218  cudaCheckErrorsAsync(cudaFree(d_temp_storage),0,pop);
219  cudaCheckErrorsAsync(cudaFree(d_freq),0,pop);
220 
221  reverse_array<<<10,1024,0,pop_stream>>>(d_mse_integral, Nchrom_e);
222  cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
223 }
224 
225 template <typename Functor_mu, typename Functor_demography, typename Functor_inbreeding>
226 __host__ void set_Index_Length(sim_struct & mutations, const int num_mutations, const Functor_mu mu_rate, const Functor_demography demography, const Functor_inbreeding FI, const float num_sites, const int compact_interval, const int generation, const int final_generation){
227  mutations.h_mutations_Index = num_mutations;
228  mutations.h_array_Length = mutations.h_mutations_Index;
229  int limit = compact_interval;
230  if(limit == 0){ limit = final_generation - generation; }
231  for(int gen = generation+1; gen <= (generation+limit) && gen <= final_generation; gen++){
232  for(int pop = 0; pop < mutations.h_num_populations; pop++){
233  int Nchrom_e = 2*demography(pop,gen)/(1+FI(pop,gen));
234  if(Nchrom_e <= 0 || mutations.h_extinct[pop]){ continue; }
235  mutations.h_array_Length += mu_rate(pop,gen)*Nchrom_e*num_sites + 7.f*sqrtf(mu_rate(pop,gen)*Nchrom_e*num_sites); //maximum distance of floating point normal rng is <7 stdevs from mean
236  }
237  }
238 
239  mutations.h_array_Length = (int)(mutations.h_array_Length/mutations.warp_size + 1*(mutations.h_array_Length%mutations.warp_size!=0))*mutations.warp_size; //extra padding for coalesced global memory access, /watch out: one day warp sizes may not be multiples of 4
240 }
241 
242 template <typename Functor_mutation, typename Functor_demography, typename Functor_selection, typename Functor_inbreeding, typename Functor_dominance>
243 __host__ void initialize_mse(sim_struct & mutations, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_selection sel_coeff, const Functor_inbreeding FI, const Functor_dominance dominance, const int final_generation, const int2 seed, const bool preserve_mutations, const int compact_interval, cudaStream_t * pop_streams, cudaEvent_t * pop_events){
244 
245  int num_freq = 0; //number of frequencies
246  for(int pop = 0; pop < mutations.h_num_populations; pop++){
247  float F = FI(pop,0);
248  int Nchrom_e = 2*demography(pop,0)/(1+F);
249  if(Nchrom_e <= 1){ continue; }
250  num_freq += (Nchrom_e - 1);
251  }
252 
253  int * d_freq_index;
254  cudaCheckErrorsAsync(cudaMalloc((void**)&d_freq_index, num_freq*sizeof(int)),0,-1);
255  int * d_scan_index;
256  cudaCheckErrorsAsync(cudaMalloc((void**)&d_scan_index, num_freq*sizeof(int)),0,-1);
257  float ** mse_integral = new float *[mutations.h_num_populations];
258 
259  int offset = 0;
260  for(int pop = 0; pop < mutations.h_num_populations; pop++){
261  int N_ind = demography(pop,0);
262  float mu = mu_rate(pop,0);
263  float F = FI(pop,0);
264  int Nchrom_e = 2*N_ind/(1+F);
265  float h = dominance(pop,0);
266  cudaCheckErrorsAsync(cudaMalloc((void**)&mse_integral[pop], Nchrom_e*sizeof(float)),0,pop);
267  if(Nchrom_e <= 1){ continue; }
268  integrate_mse(mse_integral[pop], N_ind, Nchrom_e, sel_coeff, F, h, pop, pop_streams[pop]);
269  initialize_mse_frequency_array<<<6,1024,0,pop_streams[pop]>>>(d_freq_index, mse_integral[pop], offset, mu, N_ind, Nchrom_e, mutations.h_num_sites, sel_coeff, F, h, seed, pop);
270  cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
271  offset += (Nchrom_e - 1);
272  }
273 
274  for(int pop = 0; pop < mutations.h_num_populations; pop++){
275  cudaCheckErrorsAsync(cudaFree(mse_integral[pop]),0,pop);
276  cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop],pop_streams[pop]),0,pop);
277  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[0],pop_events[pop],0),0,pop);
278  }
279 
280  delete [] mse_integral;
281 
282  void * d_temp_storage = NULL;
283  size_t temp_storage_bytes = 0;
284  cudaCheckErrorsAsync(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_freq_index, d_scan_index, num_freq, pop_streams[0]),0,-1);
285  cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),0,-1);
286  cudaCheckErrorsAsync(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_freq_index, d_scan_index, num_freq, pop_streams[0]),0,-1);
287  cudaCheckErrorsAsync(cudaFree(d_temp_storage),0,-1);
288 
289  cudaCheckErrorsAsync(cudaEventRecord(pop_events[0],pop_streams[0]),0,-1);
290  for(int pop = 0; pop < mutations.h_num_populations; pop++){
291  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],pop_events[0],0),0,pop);
292  }
293 
294  int prefix_sum_result;
295  int final_freq_count;
296  //final index is numfreq-1
297  cudaCheckErrorsAsync(cudaMemcpyAsync(&prefix_sum_result, &d_scan_index[(num_freq-1)], sizeof(int), cudaMemcpyDeviceToHost, pop_streams[0]),0,-1); //has to be in sync with host as result is used straight afterwards
298  cudaCheckErrorsAsync(cudaMemcpyAsync(&final_freq_count, &d_freq_index[(num_freq-1)], sizeof(int), cudaMemcpyDeviceToHost, pop_streams[0]),0,-1); //has to be in sync with host as result is used straight afterwards
299  if(cudaStreamQuery(pop_streams[0]) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(pop_streams[0]),0,-1); } //has to be in sync with the host since h_num_seq_mutations is manipulated on CPU right after
300  int num_mutations = prefix_sum_result+final_freq_count;
301  set_Index_Length(mutations, num_mutations, mu_rate, demography, FI, mutations.h_num_sites, compact_interval, 0, final_generation);
302  //cout<<"initial length " << mutations.h_array_Length << endl;
303 
304  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_mutations_freq, mutations.h_num_populations*mutations.h_array_Length*sizeof(float)),0,-1);
305  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_prev_freq, mutations.h_num_populations*mutations.h_array_Length*sizeof(float)),0,-1);
306  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_mutations_ID, mutations.h_array_Length*sizeof(int4)),0,-1);
307 
308  const dim3 blocksize(4,256,1);
309  const dim3 gridsize(32,32,1);
310  offset = 0;
311  for(int pop = 0; pop < mutations.h_num_populations; pop++){
312  float F = FI(pop,0);
313  int Nchrom_e = 2*demography(pop,0)/(1+F);
314  if(Nchrom_e <= 1){ continue; }
315  initialize_mse_mutation_array<<<gridsize,blocksize,0,pop_streams[pop]>>>(mutations.d_prev_freq, d_freq_index, d_scan_index, offset, Nchrom_e, pop, mutations.h_num_populations, mutations.h_array_Length);
316  cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
317  offset += (Nchrom_e - 1);
318  }
319 
320  mse_set_mutID<<<50,1024,0,pop_streams[mutations.h_num_populations]>>>(mutations.d_mutations_ID, mutations.d_prev_freq, mutations.h_mutations_Index, mutations.h_num_populations, mutations.h_array_Length, preserve_mutations);
321  cudaCheckErrorsAsync(cudaPeekAtLastError(),0,-1);
322 
323  for(int pop = 0; pop <= mutations.h_num_populations; pop++){
324  cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop],pop_streams[pop]),0,pop);
325  for(int pop2 = mutations.h_num_populations+1; pop2 < 2*mutations.h_num_populations; pop2++){
326  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop2],pop_events[pop],0),0,pop); //tells every pop_stream not used above to wait until initialization is done
327  }
328  }
329 
330  cudaCheckErrorsAsync(cudaFree(d_freq_index),0,-1);
331  cudaCheckErrorsAsync(cudaFree(d_scan_index),0,-1);
332 }
333 
334 //assumes prev_sim.num_sites is equivalent to current simulations num_sites or prev_sim.num_mutations == 0 (initialize to blank)
335 template <typename Functor_mutation, typename Functor_demography, typename Functor_inbreeding, typename Functor_preserve, typename Functor_timesample>
336 __host__ void init_blank_prev_run(sim_struct & mutations, int & generation_shift, int & final_generation, const bool use_prev_sim, const float prev_sim_num_sites, const int prev_sim_num_populations, const int prev_sim_sampled_generation, const bool * const prev_sim_extinct, const int prev_sim_num_mutations, float * prev_sim_mutations_freq, GO_Fish::mutID * prev_sim_mutations_ID, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_inbreeding FI, const Functor_preserve preserve_mutations, const Functor_timesample take_sample, const int compact_interval, cudaStream_t * pop_streams, cudaEvent_t * pop_events){
337  //if prev_sim.num_mutations == 0 or num sites or num_populations between two runs are not equivalent, don't copy (initialize to blank)
338  int num_mutations = 0;
339  if(prev_sim_num_mutations != 0 && use_prev_sim){
340  generation_shift = prev_sim_sampled_generation;
341  final_generation += generation_shift;
342  num_mutations = prev_sim_num_mutations;
343  for(int i = 0; i < mutations.h_num_populations; i++){ mutations.h_extinct[i] = prev_sim_extinct[i]; }
344  }
345 
346  set_Index_Length(mutations, num_mutations, mu_rate, demography, FI, mutations.h_num_sites, compact_interval, generation_shift, final_generation);
347  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_mutations_freq, mutations.h_num_populations*mutations.h_array_Length*sizeof(float)),0,-1);
348  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_prev_freq, mutations.h_num_populations*mutations.h_array_Length*sizeof(float)),0,-1);
349  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_mutations_ID, mutations.h_array_Length*sizeof(int4)),0,-1);
350 
351  if(prev_sim_num_mutations != 0 && use_prev_sim){
352  cudaCheckErrors(cudaHostRegister(prev_sim_mutations_freq,prev_sim_num_populations*prev_sim_num_mutations*sizeof(float),cudaHostRegisterPortable),generation_shift,-1); //pinned memory allows for asynchronous transfer to host
353  cudaCheckErrorsAsync(cudaMemcpy2DAsync(mutations.d_prev_freq, mutations.h_array_Length*sizeof(float), prev_sim_mutations_freq, prev_sim_num_mutations*sizeof(float), prev_sim_num_mutations*sizeof(float), prev_sim_num_populations, cudaMemcpyHostToDevice, pop_streams[0]),0,-1);
354  cudaCheckErrors(cudaHostUnregister(prev_sim_mutations_freq),generation_shift,-1);
355  cudaCheckErrors(cudaHostRegister(prev_sim_mutations_ID,prev_sim_num_mutations*sizeof(GO_Fish::mutID),cudaHostRegisterPortable),generation_shift,-1); //pinned memory allows for asynchronous transfer to host
356  cudaCheckErrorsAsync(cudaMemcpyAsync(mutations.d_mutations_ID, prev_sim_mutations_ID, prev_sim_num_mutations*sizeof(int4), cudaMemcpyHostToDevice, pop_streams[1]),0,-1); //While host can continue, memory copies will not overlap as in the same direction
357  cudaCheckErrors(cudaHostUnregister(prev_sim_mutations_ID),generation_shift,-1);
358  if(preserve_mutations(generation_shift) | take_sample(generation_shift)){ preserve_prev_run_mutations<<<200,512,0,pop_streams[1]>>>(mutations.d_mutations_ID, mutations.h_mutations_Index); }
359  cudaCheckErrorsAsync(cudaEventRecord(pop_events[0],pop_streams[0]),0,-1);
360  cudaCheckErrorsAsync(cudaEventRecord(pop_events[1],pop_streams[1]),0,-1);
361 
362  //wait until initialization is complete
363  for(int pop = 2; pop < 2*mutations.h_num_populations; pop++){
364  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],pop_events[0],0),0,pop);
365  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],pop_events[1],0),0,pop);
366  }
367  }
368 }
369 
370 template <typename Functor_mutation, typename Functor_demography, typename Functor_inbreeding>
371 __host__ void compact(sim_struct & mutations, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_inbreeding FI, const int generation, const int final_generation, const bool preserve_mutations, const int compact_interval, cudaStream_t * control_streams, cudaEvent_t * control_events, cudaStream_t * pop_streams){
372  unsigned int * d_flag;
373  unsigned int * d_count;
374 
375  int padded_mut_index = (((mutations.h_mutations_Index>>10)+1*(mutations.h_mutations_Index%1024!=0))<<10);
376 
377  cudaCheckErrorsAsync(cudaFree(mutations.d_mutations_freq),generation,-1);
378  cudaCheckErrorsAsync(cudaMalloc((void**)&d_flag,(padded_mut_index>>5)*sizeof(unsigned int)),generation,-1);
379  cudaCheckErrorsAsync(cudaMalloc((void**)&d_count,(padded_mut_index>>10)*sizeof(unsigned int)),generation,-1);
380 
381  flag_segregating_mutations<<<800,128,0,control_streams[0]>>>(d_flag, d_count, demography, mutations.d_prev_freq, mutations.d_mutations_ID, mutations.h_num_populations, padded_mut_index, mutations.h_mutations_Index, mutations.h_array_Length, generation, mutations.warp_size);
382  cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,-1);
383 
384  unsigned int * d_scan_Index;
385  cudaCheckErrorsAsync(cudaMalloc((void**)&d_scan_Index,(padded_mut_index>>10)*sizeof(unsigned int)),generation,-1);
386 
387  void * d_temp_storage = NULL;
388  size_t temp_storage_bytes = 0;
389  cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_count, d_scan_Index, (padded_mut_index>>10), control_streams[0]),generation,-1);
390  cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),generation,-1);
391  cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_count, d_scan_Index, (padded_mut_index>>10), control_streams[0]),generation,-1);
392  cudaCheckErrorsAsync(cudaFree(d_temp_storage),generation,-1);
393 
394  cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,-1);
395 
396  int h_num_seg_mutations;
397  cudaCheckErrorsAsync(cudaMemcpyAsync(&h_num_seg_mutations, &d_scan_Index[(padded_mut_index>>10)-1], sizeof(int), cudaMemcpyDeviceToHost, control_streams[0]),generation,-1);
398  if(cudaStreamQuery(control_streams[0]) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(control_streams[0]),generation,-1); } //has to be in sync with the host since h_num_seq_mutations is manipulated on CPU right after
399 
400  int old_array_Length = mutations.h_array_Length;
401  set_Index_Length(mutations, h_num_seg_mutations, mu_rate, demography, FI, mutations.h_num_sites, compact_interval, generation, final_generation);
402 
403  float * d_temp;
404  int4 * d_temp2;
405  cudaCheckErrorsAsync(cudaMalloc((void**)&d_temp,mutations.h_num_populations*mutations.h_array_Length*sizeof(float)),generation,-1);
406  cudaCheckErrorsAsync(cudaMalloc((void**)&d_temp2,mutations.h_array_Length*sizeof(int4)),generation,-1);
407 
408  const dim3 gridsize(800,mutations.h_num_populations,1);
409  scatter_arrays<<<gridsize,128,0,control_streams[0]>>>(d_temp, d_temp2, mutations.d_prev_freq, mutations.d_mutations_ID, d_flag, d_scan_Index, padded_mut_index, mutations.h_array_Length, old_array_Length, preserve_mutations, mutations.warp_size);
410  cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,-1);
411 
412  cudaCheckErrorsAsync(cudaEventRecord(control_events[0],control_streams[0]),generation,-1);
413 
414  for(int pop = 0; pop < 2*mutations.h_num_populations; pop++){
415  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],control_events[0],0),generation,pop);
416  }
417 
418  cudaCheckErrorsAsync(cudaFree(mutations.d_prev_freq),generation,-1);
419  cudaCheckErrorsAsync(cudaFree(mutations.d_mutations_ID),generation,-1);
420 
421  mutations.d_prev_freq = d_temp;
422  mutations.d_mutations_ID = d_temp2;
423 
424  cudaCheckErrorsAsync(cudaFree(d_flag),generation,-1);
425  cudaCheckErrorsAsync(cudaFree(d_scan_Index),generation,-1);
426  cudaCheckErrorsAsync(cudaFree(d_count),generation,-1);
427 
428  cudaCheckErrorsAsync(cudaMalloc((void**)&mutations.d_mutations_freq,mutations.h_num_populations*mutations.h_array_Length*sizeof(float)),generation,-1);
429 }
430 
431 template <typename Functor_mutation, typename Functor_demography, typename Functor_migration, typename Functor_inbreeding>
432 __host__ void check_sim_parameters(const Functor_mutation mu_rate, const Functor_demography demography, const Functor_migration mig_prop, const Functor_inbreeding FI, sim_struct & mutations, const int generation){
433  int num_pop = mutations.h_num_populations;
434  for(int pop = 0; pop < num_pop; pop++){
435  double migration = 0;
436  if(mu_rate(pop,generation) < 0){ fprintf(stderr,"check_sim_parameters: mutation error: mu_rate < 0\tgeneration %d\t population %d\n",generation,pop); exit(1); }
437  int N = demography(pop,generation);
438  if(N > 0 && mutations.h_extinct[pop]){ fprintf(stderr,"check_sim_parameters: demography error: extinct population with population size > 0\tgeneration %d\t population %d\n",generation,pop); exit(1); }
439  float fi = FI(pop,generation);
440  if(fi < 0) { fprintf(stderr,"check_sim_parameters: inbreeding error: inbreeding coefficient < 0\tgeneration %d\t population %d\n",generation,pop); exit(1); }
441  if(fi > 1) { fprintf(stderr,"check_sim_parameters: inbreeding error: inbreeding coefficient > 1\tgeneration %d\t population %d\n",generation,pop); exit(1); }
442  for(int pop2 = 0; pop2 < num_pop; pop2++){
443  float m_from = mig_prop(pop,pop2,generation);
444  if(m_from < 0){ fprintf(stderr,"check_sim_parameters: migration error: migration rate < 0\tgeneration %d\t population_from %d\t population_to %d\n",generation,pop,pop2); exit(1); }
445  if(m_from > 0 && ((N <= 0 || mutations.h_extinct[pop]) && !(demography(pop2,generation) <= 0 || mutations.h_extinct[pop2]))){ fprintf(stderr,"migration error, migration from non-existant population\tgeneration %d\t population_from %d\t population_to %d\n",generation,pop,pop2); exit(1); } //if population doesn't exist, doesn't matter if anyone migrates to it
446 
447  float m_to = mig_prop(pop2,pop,generation);
448  migration += (double)m_to;
449  }
450  if((float)migration != 1.f && !(N <= 0 || mutations.h_extinct[pop])){ fprintf(stderr,"check_sim_parameters: migration error: migration rate does not sum to 1\tgeneration %d\t population_TO %d\t total_migration_proportion %f\n",generation,pop,migration); exit(1); }
451  }
452 }
453 
454 template <typename Functor_mutation, typename Functor_demography, typename Functor_inbreeding>
455 __host__ void calc_new_mutations_Index(sim_struct & mutations, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_inbreeding FI, const int2 seed, const int generation){
456  int num_new_mutations = 0;
457  mutations.h_new_mutation_Indices[0] = mutations.h_mutations_Index;
458  for(int pop = 0; pop < mutations.h_num_populations; pop++){
459  int Nchrom_e = 2*demography(pop,generation)/(1+FI(pop,generation));
460  float mu = mu_rate(pop, generation);
461  float lambda = mu*Nchrom_e*mutations.h_num_sites;
462  if(Nchrom_e == 0 || lambda == 0 || mutations.h_extinct[pop]){ mutations.h_new_mutation_Indices[pop+1] = mutations.h_new_mutation_Indices[pop]; continue; }
463  int temp = max(RNG::ApproxRandPois1(lambda, lambda, mu, Nchrom_e*mutations.h_num_sites, seed, 1, generation, pop),0);
464  num_new_mutations += temp;
465  mutations.h_new_mutation_Indices[pop+1] = num_new_mutations + mutations.h_mutations_Index;
466  }
467 }
468 
469 template <typename Functor_timesample>
470 __host__ __forceinline__ int calc_sim_result_vector_length(Functor_timesample take_sample, int starting_generation, int final_generation) {
471  int length = 0;
472  for(int i = starting_generation; i < final_generation; i++){ if(take_sample(i)){ length++; } }
473  length++;//always takes sample of final generation
474  return length;
475 }
476 
477 template <typename Functor_demography, typename Functor_inbreeding>
478 __host__ __forceinline__ void store_time_sample(int & out_num_mutations, int & out_max_num_mutations, int & out_sampled_generation, float *& out_mutations_freq, GO_Fish::mutID *& out_mutations_ID, bool *& out_extinct, int *& out_Nchrom_e, sim_struct & mutations, Functor_demography demography, Functor_inbreeding FI, int sampled_generation, int final_generation, cudaStream_t * control_streams, cudaEvent_t * control_events){
479  out_num_mutations = mutations.h_mutations_Index;
480  out_sampled_generation = sampled_generation;
481  out_mutations_freq = new float[mutations.h_num_populations*out_num_mutations];
482  cudaCheckErrors(cudaHostRegister(out_mutations_freq,mutations.h_num_populations*out_num_mutations*sizeof(float),cudaHostRegisterPortable),sampled_generation,-1); //pinned memory allows for asynchronous transfer to host
483  cudaCheckErrorsAsync(cudaMemcpy2DAsync(out_mutations_freq, out_num_mutations*sizeof(float), mutations.d_prev_freq, mutations.h_array_Length*sizeof(float), out_num_mutations*sizeof(float), mutations.h_num_populations, cudaMemcpyDeviceToHost, control_streams[0]),sampled_generation,-1); //removes padding
484  cudaCheckErrors(cudaHostUnregister(out_mutations_freq),sampled_generation,-1);
485  if(sampled_generation == final_generation){
486  out_mutations_ID = (GO_Fish::mutID *)malloc(out_num_mutations*sizeof(GO_Fish::mutID)); //use malloc to avoid calling mutID constructor (new calls constructor and is slower by a couple of percent)
487  cudaCheckErrors(cudaHostRegister(out_mutations_ID,out_num_mutations*sizeof(int4),cudaHostRegisterPortable),sampled_generation,-1); //pinned memory allows for asynchronous transfer to host
488  cudaCheckErrorsAsync(cudaMemcpyAsync(out_mutations_ID, mutations.d_mutations_ID, out_num_mutations*sizeof(int4), cudaMemcpyDeviceToHost, control_streams[0]),sampled_generation,-1); //mutations array is 1D
489  cudaCheckErrors(cudaHostUnregister(out_mutations_ID),sampled_generation,-1);
490  out_max_num_mutations = out_num_mutations;
491  }
492  out_extinct = new bool[mutations.h_num_populations];
493  out_Nchrom_e = new int[mutations.h_num_populations];
494  for(int i = 0; i < mutations.h_num_populations; i++){
495  out_extinct[i] = mutations.h_extinct[i];
496  out_Nchrom_e[i] = 2*demography(i,sampled_generation)/(1+FI(i,sampled_generation));
497  }
498 
499  cudaCheckErrorsAsync(cudaEventRecord(control_events[0],control_streams[0]),sampled_generation,-1);
500  //1 round of migration_selection_drift and add_new_mutations can be done simultaneously with above as they change d_mutations_freq array, not d_prev_freq
501 }
502 
503 } /* ----- end namespace go_fish_details ----- */
505 
518 namespace GO_Fish{
519 
524 template <typename Functor_mutation, typename Functor_demography, typename Functor_migration, typename Functor_selection, typename Functor_inbreeding, typename Functor_dominance, typename Functor_preserve, typename Functor_timesample>
525 __host__ void run_sim(allele_trajectories & all_results, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_migration mig_prop, const Functor_selection sel_coeff, const Functor_inbreeding FI, const Functor_dominance dominance, const Functor_preserve preserve_mutations, const Functor_timesample take_sample){
526  run_sim(all_results, mu_rate, demography, mig_prop, sel_coeff, FI, dominance, preserve_mutations, take_sample, allele_trajectories());
527 }
528 
553 template <typename Functor_mutation, typename Functor_demography, typename Functor_migration, typename Functor_selection, typename Functor_inbreeding, typename Functor_dominance, typename Functor_preserve, typename Functor_timesample>
554 __host__ void run_sim(allele_trajectories & all_results, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_migration mig_prop, const Functor_selection sel_coeff, const Functor_inbreeding FI, const Functor_dominance dominance, const Functor_preserve preserve_mutations, const Functor_timesample take_sample, const allele_trajectories & prev_sim){
555 
556  using namespace go_fish_details;
557 
558  int2 seed;
559  seed.x = all_results.sim_input_constants.seed1;
560  seed.y = all_results.sim_input_constants.seed2;
561  cudaDeviceProp devProp = set_cuda_device(all_results.sim_input_constants.device);
562 
563  sim_struct mutations;
564 
565  mutations.h_num_populations = all_results.sim_input_constants.num_populations;
566  mutations.h_new_mutation_Indices = new int[mutations.h_num_populations+1];
567  mutations.h_num_sites = all_results.sim_input_constants.num_sites;
568  mutations.h_extinct = new bool[mutations.h_num_populations];
569  for(int i = 0; i < mutations.h_num_populations; i++){ mutations.h_extinct[i] = 0; } //some compilers won't default to 0
570  mutations.warp_size = devProp.warpSize;
571 
572  cudaStream_t * pop_streams = new cudaStream_t[2*mutations.h_num_populations];
573  cudaEvent_t * pop_events = new cudaEvent_t[2*mutations.h_num_populations];
574 
575  int num_control_streams = 1;
576  cudaStream_t * control_streams = new cudaStream_t[num_control_streams];;
577  cudaEvent_t * control_events = new cudaEvent_t[num_control_streams];;
578 
579  for(int pop = 0; pop < 2*mutations.h_num_populations; pop++){
580  cudaCheckErrors(cudaStreamCreate(&pop_streams[pop]),-1,pop);
581  cudaCheckErrorsAsync(cudaEventCreateWithFlags(&pop_events[pop],cudaEventDisableTiming),-1,pop);
582  }
583 
584  for(int stream = 0; stream < num_control_streams; stream++){
585  cudaCheckErrors(cudaStreamCreate(&control_streams[stream]),-1,stream);
586  cudaCheckErrorsAsync(cudaEventCreateWithFlags(&control_events[stream],cudaEventDisableTiming),-1,stream);
587  }
588 
589  int generation = 0;
590  int final_generation = all_results.sim_input_constants.num_generations;
591  int compact_interval = all_results.sim_input_constants.compact_interval;
592 
593  //----- initialize simulation -----
594  if(all_results.sim_input_constants.init_mse){
595  //----- mutation-selection equilibrium (mse) (default) -----
596  check_sim_parameters(mu_rate, demography, mig_prop, FI, mutations, generation);
597  initialize_mse(mutations, mu_rate, demography, sel_coeff, FI, dominance, final_generation, seed, (preserve_mutations(0)|take_sample(0)), compact_interval, pop_streams, pop_events);
598  //----- end -----
599  }else{
600  //----- initialize from results of previous simulation run or initialize to blank (blank will often take many generations to reach equilibrium) -----
601  int sample_index = all_results.sim_input_constants.prev_sim_sample;
602  if((sample_index >= 0 && sample_index < all_results.num_samples) && prev_sim.time_samples){
603  float prev_sim_num_sites = prev_sim.sim_run_constants.num_sites;
604  int prev_sim_num_populations = prev_sim.sim_run_constants.num_populations;
605  if(mutations.h_num_sites == prev_sim_num_sites && mutations.h_num_populations == prev_sim_num_populations){
606  //initialize from previous simulation
607  init_blank_prev_run(mutations, generation, final_generation, true, prev_sim_num_sites, prev_sim_num_populations, prev_sim.time_samples[sample_index]->sampled_generation, prev_sim.time_samples[sample_index]->extinct, prev_sim.time_samples[sample_index]->num_mutations, prev_sim.time_samples[sample_index]->mutations_freq, prev_sim.mutations_ID, mu_rate, demography, FI, preserve_mutations, take_sample, compact_interval, pop_streams, pop_events);
608  }else{
609  fprintf(stderr,"run_sim error: prev_sim parameters do not match current simulation parameters: prev_sim num_sites %f\tcurrent_sim num_sites %f,\tprev_sim num_populations %d\tcurrent_sim num_populations %d\n",prev_sim_num_sites,mutations.h_num_sites,prev_sim_num_populations,mutations.h_num_populations); exit(1);
610  }
611  }
612  else if(sample_index < 0){
613  //initialize blank simulation
614  init_blank_prev_run(mutations, generation, final_generation, false, -1.f, -1, -1, NULL, 0, NULL, NULL, mu_rate, demography, FI, preserve_mutations, take_sample, compact_interval, pop_streams, pop_events);
615  }
616  else{
617  if(!prev_sim.time_samples){ fprintf(stderr,"run_sim error: requested time sample from empty prev_sim\n"); exit(1); }
618  fprintf(stderr,"run_sim error: requested sample index out of bounds for prev_sim: sample %d\t[0\t %d)\n",sample_index,prev_sim.num_samples); exit(1);
619  }
620 
621  check_sim_parameters(mu_rate, demography, mig_prop, FI, mutations, generation);
622  //----- end -----
623  }
624 
625  int new_length = calc_sim_result_vector_length(take_sample,generation,final_generation);
626  all_results.initialize_sim_result_vector(new_length);
627 
628  //----- take time samples of allele trajectory -----
629  int sample_index = 0;
630  if(take_sample(generation) && generation != final_generation){
631  for(int pop = 0; pop < 2*mutations.h_num_populations; pop++){ cudaCheckErrorsAsync(cudaStreamWaitEvent(control_streams[0],pop_events[pop],0), generation, pop); } //wait to sample until after initialization is done
632  store_time_sample(all_results.time_samples[sample_index]->num_mutations, all_results.all_mutations, all_results.time_samples[sample_index]->sampled_generation, all_results.time_samples[sample_index]->mutations_freq, all_results.mutations_ID, all_results.time_samples[sample_index]->extinct, all_results.time_samples[sample_index]->Nchrom_e, mutations, demography, FI, generation, final_generation, control_streams, control_events);
633  sample_index++;
634  }
635  //----- end -----
636  //----- end -----
637 
638 // std::cout<< std::endl <<"initial length " << mutations.h_array_Length << std::endl;
639 // std::cout<<"initial num_mutations " << mutations.h_mutations_Index;
640 // std::cout<< std::endl <<"generation " << generation;
641 
642  //----- simulation steps -----
643  int next_compact_generation = generation + compact_interval;
644 
645  while((generation+1) <= final_generation){ //end of simulation
646  generation++;
647  check_sim_parameters(mu_rate, demography, mig_prop, FI, mutations, generation);
648  //----- migration, selection, drift -----
649  for(int pop = 0; pop < mutations.h_num_populations; pop++){
650  int N_ind = demography(pop,generation);
651  if(mutations.h_extinct[pop]){ continue; }
652  if(N_ind <= 0){
653  if(demography(pop,generation-1) > 0){ //previous generation, the population was alive
654  N_ind = 0; //allow to go extinct
655  mutations.h_extinct[pop] = true; //next generation will not process
656  } else{ continue; } //if population has not yet arisen, it will have a population size of 0, can simply not process
657  }
658  float F = FI(pop,generation);
659  int Nchrom_e = 2*N_ind/(1+F);
660 
661  float h = dominance(pop,generation);
662  //10^5 mutations: 600 blocks for 1 population, 300 blocks for 3 pops
663  migration_selection_drift<<<600,128,0,pop_streams[pop]>>>(mutations.d_mutations_freq, mutations.d_prev_freq, mutations.d_mutations_ID, mutations.h_mutations_Index, mutations.h_array_Length, Nchrom_e, mig_prop, sel_coeff, F, h, seed, pop, mutations.h_num_populations, generation);
664  cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,pop);
665  cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop],pop_streams[pop]),generation,pop);
666  }
667  //----- end -----
668 
669  //----- generate new mutations -----
670  calc_new_mutations_Index(mutations, mu_rate, demography, FI, seed, generation);
671  for(int pop = 0; pop < mutations.h_num_populations; pop++){
672  int N_ind = demography(pop,generation);
673  if((N_ind <= 0) || mutations.h_extinct[pop]){ continue; }
674  float F = FI(pop,generation);
675  int Nchrom_e = 2*N_ind/(1+F);
676  float freq = 1.f/Nchrom_e;
677  int prev_Index = mutations.h_new_mutation_Indices[pop];
678  int new_Index = mutations.h_new_mutation_Indices[pop+1];
679  add_new_mutations<<<20,512,0,pop_streams[pop+mutations.h_num_populations]>>>(mutations.d_mutations_freq, mutations.d_mutations_ID, prev_Index, new_Index, mutations.h_array_Length, freq, pop, mutations.h_num_populations, generation);
680  cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,pop);
681  cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop+mutations.h_num_populations],pop_streams[pop+mutations.h_num_populations]),generation,pop);
682  }
683  mutations.h_mutations_Index = mutations.h_new_mutation_Indices[mutations.h_num_populations];
684  //----- end -----
685 
686  bool preserve = (preserve_mutations(generation) | take_sample(generation)); //preserve mutations in sample
687 
688  for(int pop1 = 0; pop1 < mutations.h_num_populations; pop1++){
689  for(int pop2 = 0; pop2 < mutations.h_num_populations; pop2++){
690  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop1],pop_events[pop2],0), generation, pop1*mutations.h_num_populations+pop2); //wait to do the next round of mig_sel_drift until mig_sel_drift is done
691  cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop1],pop_events[pop2+mutations.h_num_populations],0), generation, pop1*mutations.h_num_populations+pop2); //wait to do the next round of mig_sel_drift until add_mut is done
692  }
693  if(generation == next_compact_generation || generation == final_generation || preserve){
694  cudaCheckErrorsAsync(cudaStreamWaitEvent(control_streams[0],pop_events[pop1],0), generation, pop1); //wait to compact/sample until after mig_sel_drift and add_new_mut are done
695  cudaCheckErrorsAsync(cudaStreamWaitEvent(control_streams[0],pop_events[pop1+mutations.h_num_populations],0), generation, pop1);
696  }
697  }
698 
699  //tell add mutations and migseldrift to pause here if not yet done streaming recoded data to host (store_time_sample) from previous generation
700  if(take_sample(generation-1)){ for(int pop = 0; pop < 2*mutations.h_num_populations; pop++){ cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],control_events[0],0),generation,pop); } }
701 
702  std::swap(mutations.d_prev_freq,mutations.d_mutations_freq); //even if previous kernels/cuda commands not finished yet, the fact that the pointer labels have switched doesn't change which pointers they were passed, still need to sync kernels with StreamWaitEvents but don't need to synchronize CPU and GPU here
703 
704  //----- compact every compact_interval generations, final generation, and before preserving mutations; compact_interval == 0 shuts off compact -----
705  if((generation == next_compact_generation || generation == final_generation || preserve) && compact_interval > 0){ compact(mutations, mu_rate, demography, FI, generation, final_generation, preserve, compact_interval, control_streams, control_events, pop_streams); next_compact_generation = generation + compact_interval; }
706  //----- end -----
707 
708  //----- take time samples of allele trajectories -----
709  if(take_sample(generation) && generation != final_generation){
710  store_time_sample(all_results.time_samples[sample_index]->num_mutations, all_results.all_mutations, all_results.time_samples[sample_index]->sampled_generation, all_results.time_samples[sample_index]->mutations_freq, all_results.mutations_ID, all_results.time_samples[sample_index]->extinct, all_results.time_samples[sample_index]->Nchrom_e, mutations, demography, FI, generation, final_generation, control_streams, control_events);
711  sample_index++;
712  }
713  //----- end -----
714  }
715 
716  //----- take final time sample of allele trajectories -----
717  store_time_sample(all_results.time_samples[sample_index]->num_mutations, all_results.all_mutations, all_results.time_samples[sample_index]->sampled_generation, all_results.time_samples[sample_index]->mutations_freq, all_results.mutations_ID, all_results.time_samples[sample_index]->extinct, all_results.time_samples[sample_index]->Nchrom_e, mutations, demography, FI, generation, final_generation, control_streams, control_events);
718  //----- end -----
719  //----- end -----
720 
721  if(cudaStreamQuery(control_streams[0]) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(control_streams[0]), generation, -1); } //ensures writes to host are finished before host can manipulate the data
722 
723  for(int pop = 0; pop < 2*mutations.h_num_populations; pop++){ cudaCheckErrorsAsync(cudaStreamDestroy(pop_streams[pop]),generation,pop); cudaCheckErrorsAsync(cudaEventDestroy(pop_events[pop]),generation,pop); }
724  for(int stream = 0; stream < num_control_streams; stream++){ cudaCheckErrorsAsync(cudaStreamDestroy(control_streams[stream]),generation,stream); cudaCheckErrorsAsync(cudaEventDestroy(control_events[stream]),generation,stream); }
725 
726  delete [] pop_streams;
727  delete [] pop_events;
728  delete [] control_streams;
729  delete [] control_events;
730 }
731 
732 } /* ----- end namespace GO_Fish ----- */
733 
734 #endif /* GO_FISH_IMPL_CUH */
int prev_sim_sample
time sample of previous simulation to use for initializing current simulation
control and output data structure for GO_Fish simulation
int compact_interval
how often to compact the simulation and remove fixed or lost mutations
int num_populations
number of populations in simulation
structure specifying the ID for a mutation in a GO_Fish simulation
void swap(allele_trajectories &a, allele_trajectories &b)
swaps data held by allele_trajectories a and b
__host__ void run_sim(allele_trajectories &all_results, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_migration mig_prop, const Functor_selection sel_coeff, const Functor_inbreeding FI, const Functor_dominance dominance, const Functor_preserve preserve_mutations, const Functor_timesample take_sample)
runs a single-locus Wright-Fisher simulation specified by the given simulation functions and sim_cons...
float num_sites
number of sites in simulation
bool init_mse
true: initialize simulation in mutation_selection_equilibrium; false: initialize blank simulation or ...
int device
GPU identity to run simulation on, if -1 next available GPU will be assigned.
Namespace for single-locus, forward, Monte-Carlo Wright-Fisher simulation and output data structures...
Definition: go_fish.cuh:326
sim_constants sim_input_constants
constants for initializing the next simulation
int num_generations
number of generations in simulation