8 #ifndef GO_FISH_IMPL_CUH_ 9 #define GO_FISH_IMPL_CUH_ 11 #include "../_outside_libraries/cub/device/device_scan.cuh" 12 #include "../_internal/shared.cuh" 13 #include "../go_fish_data_struct.h" 16 namespace go_fish_details{
18 __device__ __forceinline__ float4 operator-(
float a, float4 b){
return make_float4((a-b.x), (a-b.y), (a-b.z), (a-b.w)); }
20 __device__ __forceinline__
float mse(
float i,
int N,
float F,
float h,
float s){
21 return exp(2*N*s*i*((2*h+(1-2*h)*i)*(1-F) + 2*F)/(1+F));
24 template <
typename Functor_selection>
26 Functor_selection sel_coeff;
30 mse_integrand(): N(0), h(0), F(0), pop(0), gen(0) {}
31 mse_integrand(Functor_selection xsel_coeff,
int xN,
float xF,
float xh,
int xpop,
int xgen = 0): sel_coeff(xsel_coeff), N(xN), F(xF), h(xh), pop(xpop), gen(xgen) { }
33 __device__ __forceinline__
float operator()(
float i)
const{
34 float s = max(sel_coeff(pop, gen, i),-1.f);
35 return mse(i, N, F, h, -1*s);
39 template<
typename Functor_function>
40 struct trapezoidal_upper{
42 trapezoidal_upper() { }
43 trapezoidal_upper(Functor_function xfun): fun(xfun) { }
44 __device__ __forceinline__
float operator()(
float a,
float step_size)
const{
return step_size*(fun(a)+fun(a-step_size))/2; }
48 template <
typename Functor_Integrator>
49 __global__
void calculate_area(
float * d_freq,
const int num_freq,
const float step_size, Functor_Integrator trapezoidal){
50 int myID = blockIdx.x*blockDim.x + threadIdx.x;
52 for(
int id = myID;
id < num_freq;
id += blockDim.x*gridDim.x){ d_freq[id] = trapezoidal((1.0 -
id*step_size), step_size); }
55 __global__
void reverse_array(
float * array,
const int N);
58 template <
typename Functor_selection>
59 __global__
void initialize_mse_frequency_array(
int * freq_index,
float * mse_integral,
const int offset,
const float mu,
const int Nind,
const int Nchrom,
const float L,
const Functor_selection sel_coeff,
const float F,
const float h,
const int2 seed,
const int population){
60 int myID = blockIdx.x*blockDim.x + threadIdx.x;
61 float mse_total = mse_integral[0];
62 for(
int id = myID;
id < (Nchrom-1);
id += blockDim.x*gridDim.x){
63 float i = (
id+1.f)/Nchrom;
64 float j = ((Nchrom - id)+1.f)/Nchrom;
65 float s = sel_coeff(population, 0, i);
67 if(s == 0){ lambda = 2*mu*L/i; }
68 else{ lambda = 2*mu*L*(mse(i, Nind, F, h, s)*mse_integral[id])/(mse_total*i*j); }
69 freq_index[offset+id] = max(RNG::ApproxRandPois1(lambda, lambda, mu, L*Nchrom, seed, 0,
id, population),0);
77 __global__
void initialize_mse_mutation_array(
float * mutations_freq,
const int * freq_index,
const int * scan_index,
const int offset,
const int Nchrom,
const int population,
const int num_populations,
const int array_Length);
79 __global__
void mse_set_mutID(int4 * mutations_ID,
const float *
const mutations_freq,
const int mutations_Index,
const int num_populations,
const int array_Length,
const bool preserve_mutations);
93 template <
typename Functor_migration,
typename Functor_selection>
94 __global__
void migration_selection_drift(
float * mutations_freq,
float *
const prev_freq, int4 *
const mutations_ID,
const int mutations_Index,
const int array_Length,
const int N,
const Functor_migration mig_prop,
const Functor_selection sel_coeff,
const float F,
const float h,
const int2 seed,
const int population,
const int num_populations,
const int generation){
95 int myID = blockIdx.x*blockDim.x + threadIdx.x;
97 for(
int id = myID;
id < mutations_Index/4;
id+= blockDim.x*gridDim.x){
98 float4 i_mig = make_float4(0);
99 for(
int pop = 0; pop < num_populations; pop++){
100 float4 i =
reinterpret_cast<float4*
>(prev_freq)[pop*array_Length/4+
id];
101 i_mig += mig_prop(pop,population,generation)*i;
104 float4 s = make_float4(max(sel_coeff(population,generation,i_mig.x),-1.f),max(sel_coeff(population,generation,i_mig.y),-1.f),max(sel_coeff(population,generation,i_mig.z),-1.f),max(sel_coeff(population,generation,i_mig.w),-1.f));
105 float4 i_mig_sel = (s*i_mig*i_mig+i_mig+(F+h-h*F)*s*i_mig*(1-i_mig))/(i_mig*i_mig*s+(F+2*h-2*h*F)*s*i_mig*(1-i_mig)+1);
106 float4 mean = i_mig_sel*N;
107 int4 j_mig_sel_drift = clamp(RNG::ApproxRandBinom4(mean,(1.0-i_mig_sel)*mean,i_mig_sel,N,seed,(
id + 2),generation,population), 0, N);
108 reinterpret_cast<float4*
>(mutations_freq)[population*array_Length/4+
id] = make_float4(j_mig_sel_drift)/N;
110 int id = myID + mutations_Index/4 * 4;
111 if(
id < mutations_Index){
113 for(
int pop = 0; pop < num_populations; pop++){
114 float i = prev_freq[pop*array_Length+id];
115 i_mig += mig_prop(pop,population,generation)*i;
118 float s = max(sel_coeff(population,generation,i_mig),-1.f);
119 float i_mig_sel = (s*i_mig*i_mig+i_mig+(F+h-h*F)*s*i_mig*(1-i_mig))/(i_mig*i_mig*s+(F+2*h-2*h*F)*s*i_mig*(1-i_mig)+1);
120 float mean = i_mig_sel*N;
121 int j_mig_sel_drift = clamp(RNG::ApproxRandBinom1(mean,(1.0-i_mig_sel)*mean,i_mig_sel,N,seed,(
id + 2),generation,population), 0, N);
122 mutations_freq[population*array_Length+id] = float(j_mig_sel_drift)/N;
126 __global__
void add_new_mutations(
float * mutations_freq, int4 * mutations_ID,
const int prev_mutations_Index,
const int new_mutations_Index,
const int array_Length,
float freq,
const int population,
const int num_populations,
const int generation);
128 __device__ __forceinline__
bool boundary_0(
float freq){
129 return (freq <= 0.f);
132 __device__ __forceinline__
bool boundary_1(
float freq){
133 return (freq >= 1.f);
137 template <
typename Functor_demography>
138 __global__
void flag_segregating_mutations(
unsigned int * flag,
unsigned int * counter,
const Functor_demography demography,
const float *
const mutations_freq,
const int4 *
const mutations_ID,
const int num_populations,
const int padded_mut_Index,
const int mutations_Index,
const int array_Length,
const int generation,
const int warp_size){
140 int myID = blockIdx.x*blockDim.x + threadIdx.x;
141 for(
int id = myID; id < (padded_mut_Index >> 5);
id+= blockDim.x*gridDim.x){
142 int lnID = threadIdx.x % warp_size;
143 int warpID =
id >> 5;
148 for(
int j = 0; j < 32; j++){
152 int index = (warpID<<10)+(j<<5)+lnID;
153 if(index >= mutations_Index){ zero *= 1; one = 0; }
155 for(
int pop = 0; pop < num_populations; pop++){
156 if(demography(pop,generation) > 0){
157 float i = mutations_freq[pop*array_Length+index];
158 zero = zero & boundary_0(i);
159 one = one & boundary_1(i);
164 preserve = (mutations_ID[index].z < 0);
166 mask = __ballot((!(zero|one) || preserve));
169 flag[(warpID<<5)+j] = mask;
174 if(lnID == 0) counter[warpID] = cnt;
178 __global__
void scatter_arrays(
float * new_mutations_freq, int4 * new_mutations_ID,
const float *
const mutations_freq,
const int4 *
const mutations_ID,
const unsigned int *
const flag,
const unsigned int *
const scan_Index,
const int padded_mut_Index,
const int new_array_Length,
const int old_array_Length,
const bool preserve_mutations,
const int warp_size);
180 __global__
void preserve_prev_run_mutations(int4 * mutations_ID,
const int mutations_Index);
185 float * d_mutations_freq;
187 int4 * d_mutations_ID;
189 int h_num_populations;
191 int h_mutations_Index;
193 int * h_new_mutation_Indices;
197 sim_struct(): h_num_populations(0), h_array_Length(0), h_mutations_Index(0), h_num_sites(0), warp_size(0) { d_mutations_freq = NULL; d_prev_freq = NULL; d_mutations_ID = NULL; h_new_mutation_Indices = NULL; h_extinct = NULL;}
198 ~sim_struct(){ cudaCheckErrorsAsync(cudaFree(d_mutations_freq),-1,-1); cudaCheckErrorsAsync(cudaFree(d_prev_freq),-1,-1); cudaCheckErrorsAsync(cudaFree(d_mutations_ID),-1,-1);
if(h_new_mutation_Indices) {
delete [] h_new_mutation_Indices; }
if(h_extinct){
delete [] h_extinct; } }
201 template <
typename Functor_selection>
202 __host__
void integrate_mse(
float * d_mse_integral,
const int N_ind,
const int Nchrom_e,
const Functor_selection sel_coeff,
const float F,
const float h,
int pop, cudaStream_t pop_stream){
205 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_freq, Nchrom_e*
sizeof(
float)),0,pop);
207 mse_integrand<Functor_selection> mse_fun(sel_coeff, N_ind, F, h, pop);
208 trapezoidal_upper< mse_integrand<Functor_selection> > trap(mse_fun);
210 calculate_area<<<10,1024,0,pop_stream>>>(d_freq, Nchrom_e, (float)1.0/(Nchrom_e), trap);
211 cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
213 void * d_temp_storage = NULL;
214 size_t temp_storage_bytes = 0;
215 cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_freq, d_mse_integral, Nchrom_e, pop_stream),0,pop);
216 cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),0,pop);
217 cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_freq, d_mse_integral, Nchrom_e, pop_stream),0,pop);
218 cudaCheckErrorsAsync(cudaFree(d_temp_storage),0,pop);
219 cudaCheckErrorsAsync(cudaFree(d_freq),0,pop);
221 reverse_array<<<10,1024,0,pop_stream>>>(d_mse_integral, Nchrom_e);
222 cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
225 template <
typename Functor_mu,
typename Functor_demography,
typename Functor_inbreeding>
226 __host__
void set_Index_Length(sim_struct & mutations,
const int num_mutations,
const Functor_mu mu_rate,
const Functor_demography demography,
const Functor_inbreeding FI,
const float num_sites,
const int compact_interval,
const int generation,
const int final_generation){
227 mutations.h_mutations_Index = num_mutations;
228 mutations.h_array_Length = mutations.h_mutations_Index;
229 int limit = compact_interval;
230 if(limit == 0){ limit = final_generation - generation; }
231 for(
int gen = generation+1; gen <= (generation+limit) && gen <= final_generation; gen++){
232 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
233 int Nchrom_e = 2*demography(pop,gen)/(1+FI(pop,gen));
234 if(Nchrom_e <= 0 || mutations.h_extinct[pop]){
continue; }
235 mutations.h_array_Length += mu_rate(pop,gen)*Nchrom_e*num_sites + 7.f*sqrtf(mu_rate(pop,gen)*Nchrom_e*num_sites);
239 mutations.h_array_Length = (int)(mutations.h_array_Length/mutations.warp_size + 1*(mutations.h_array_Length%mutations.warp_size!=0))*mutations.warp_size;
242 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_selection,
typename Functor_inbreeding,
typename Functor_dominance>
243 __host__
void initialize_mse(sim_struct & mutations,
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_selection sel_coeff,
const Functor_inbreeding FI,
const Functor_dominance dominance,
const int final_generation,
const int2 seed,
const bool preserve_mutations,
const int compact_interval, cudaStream_t * pop_streams, cudaEvent_t * pop_events){
246 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
248 int Nchrom_e = 2*demography(pop,0)/(1+F);
249 if(Nchrom_e <= 1){
continue; }
250 num_freq += (Nchrom_e - 1);
254 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_freq_index, num_freq*
sizeof(
int)),0,-1);
256 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_scan_index, num_freq*
sizeof(
int)),0,-1);
257 float ** mse_integral =
new float *[mutations.h_num_populations];
260 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
261 int N_ind = demography(pop,0);
262 float mu = mu_rate(pop,0);
264 int Nchrom_e = 2*N_ind/(1+F);
265 float h = dominance(pop,0);
266 cudaCheckErrorsAsync(cudaMalloc((
void**)&mse_integral[pop], Nchrom_e*
sizeof(
float)),0,pop);
267 if(Nchrom_e <= 1){
continue; }
268 integrate_mse(mse_integral[pop], N_ind, Nchrom_e, sel_coeff, F, h, pop, pop_streams[pop]);
269 initialize_mse_frequency_array<<<6,1024,0,pop_streams[pop]>>>(d_freq_index, mse_integral[pop], offset, mu, N_ind, Nchrom_e, mutations.h_num_sites, sel_coeff, F, h, seed, pop);
270 cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
271 offset += (Nchrom_e - 1);
274 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
275 cudaCheckErrorsAsync(cudaFree(mse_integral[pop]),0,pop);
276 cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop],pop_streams[pop]),0,pop);
277 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[0],pop_events[pop],0),0,pop);
280 delete [] mse_integral;
282 void * d_temp_storage = NULL;
283 size_t temp_storage_bytes = 0;
284 cudaCheckErrorsAsync(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_freq_index, d_scan_index, num_freq, pop_streams[0]),0,-1);
285 cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),0,-1);
286 cudaCheckErrorsAsync(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_freq_index, d_scan_index, num_freq, pop_streams[0]),0,-1);
287 cudaCheckErrorsAsync(cudaFree(d_temp_storage),0,-1);
289 cudaCheckErrorsAsync(cudaEventRecord(pop_events[0],pop_streams[0]),0,-1);
290 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
291 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],pop_events[0],0),0,pop);
294 int prefix_sum_result;
295 int final_freq_count;
297 cudaCheckErrorsAsync(cudaMemcpyAsync(&prefix_sum_result, &d_scan_index[(num_freq-1)],
sizeof(
int), cudaMemcpyDeviceToHost, pop_streams[0]),0,-1);
298 cudaCheckErrorsAsync(cudaMemcpyAsync(&final_freq_count, &d_freq_index[(num_freq-1)],
sizeof(
int), cudaMemcpyDeviceToHost, pop_streams[0]),0,-1);
299 if(cudaStreamQuery(pop_streams[0]) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(pop_streams[0]),0,-1); }
300 int num_mutations = prefix_sum_result+final_freq_count;
301 set_Index_Length(mutations, num_mutations, mu_rate, demography, FI, mutations.h_num_sites, compact_interval, 0, final_generation);
304 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_mutations_freq, mutations.h_num_populations*mutations.h_array_Length*
sizeof(
float)),0,-1);
305 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_prev_freq, mutations.h_num_populations*mutations.h_array_Length*
sizeof(
float)),0,-1);
306 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_mutations_ID, mutations.h_array_Length*
sizeof(int4)),0,-1);
308 const dim3 blocksize(4,256,1);
309 const dim3 gridsize(32,32,1);
311 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
313 int Nchrom_e = 2*demography(pop,0)/(1+F);
314 if(Nchrom_e <= 1){
continue; }
315 initialize_mse_mutation_array<<<gridsize,blocksize,0,pop_streams[pop]>>>(mutations.d_prev_freq, d_freq_index, d_scan_index, offset, Nchrom_e, pop, mutations.h_num_populations, mutations.h_array_Length);
316 cudaCheckErrorsAsync(cudaPeekAtLastError(),0,pop);
317 offset += (Nchrom_e - 1);
320 mse_set_mutID<<<50,1024,0,pop_streams[mutations.h_num_populations]>>>(mutations.d_mutations_ID, mutations.d_prev_freq, mutations.h_mutations_Index, mutations.h_num_populations, mutations.h_array_Length, preserve_mutations);
321 cudaCheckErrorsAsync(cudaPeekAtLastError(),0,-1);
323 for(
int pop = 0; pop <= mutations.h_num_populations; pop++){
324 cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop],pop_streams[pop]),0,pop);
325 for(
int pop2 = mutations.h_num_populations+1; pop2 < 2*mutations.h_num_populations; pop2++){
326 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop2],pop_events[pop],0),0,pop);
330 cudaCheckErrorsAsync(cudaFree(d_freq_index),0,-1);
331 cudaCheckErrorsAsync(cudaFree(d_scan_index),0,-1);
335 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_inbreeding,
typename Functor_preserve,
typename Functor_timesample>
336 __host__
void init_blank_prev_run(sim_struct & mutations,
int & generation_shift,
int & final_generation,
const bool use_prev_sim,
const float prev_sim_num_sites,
const int prev_sim_num_populations,
const int prev_sim_sampled_generation,
const bool *
const prev_sim_extinct,
const int prev_sim_num_mutations,
float * prev_sim_mutations_freq,
GO_Fish::mutID * prev_sim_mutations_ID,
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_inbreeding FI,
const Functor_preserve preserve_mutations,
const Functor_timesample take_sample,
const int compact_interval, cudaStream_t * pop_streams, cudaEvent_t * pop_events){
338 int num_mutations = 0;
339 if(prev_sim_num_mutations != 0 && use_prev_sim){
340 generation_shift = prev_sim_sampled_generation;
341 final_generation += generation_shift;
342 num_mutations = prev_sim_num_mutations;
343 for(
int i = 0; i < mutations.h_num_populations; i++){ mutations.h_extinct[i] = prev_sim_extinct[i]; }
346 set_Index_Length(mutations, num_mutations, mu_rate, demography, FI, mutations.h_num_sites, compact_interval, generation_shift, final_generation);
347 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_mutations_freq, mutations.h_num_populations*mutations.h_array_Length*
sizeof(
float)),0,-1);
348 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_prev_freq, mutations.h_num_populations*mutations.h_array_Length*
sizeof(
float)),0,-1);
349 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_mutations_ID, mutations.h_array_Length*
sizeof(int4)),0,-1);
351 if(prev_sim_num_mutations != 0 && use_prev_sim){
352 cudaCheckErrors(cudaHostRegister(prev_sim_mutations_freq,prev_sim_num_populations*prev_sim_num_mutations*
sizeof(
float),cudaHostRegisterPortable),generation_shift,-1);
353 cudaCheckErrorsAsync(cudaMemcpy2DAsync(mutations.d_prev_freq, mutations.h_array_Length*
sizeof(
float), prev_sim_mutations_freq, prev_sim_num_mutations*
sizeof(
float), prev_sim_num_mutations*
sizeof(
float), prev_sim_num_populations, cudaMemcpyHostToDevice, pop_streams[0]),0,-1);
354 cudaCheckErrors(cudaHostUnregister(prev_sim_mutations_freq),generation_shift,-1);
355 cudaCheckErrors(cudaHostRegister(prev_sim_mutations_ID,prev_sim_num_mutations*
sizeof(
GO_Fish::mutID),cudaHostRegisterPortable),generation_shift,-1);
356 cudaCheckErrorsAsync(cudaMemcpyAsync(mutations.d_mutations_ID, prev_sim_mutations_ID, prev_sim_num_mutations*
sizeof(int4), cudaMemcpyHostToDevice, pop_streams[1]),0,-1);
357 cudaCheckErrors(cudaHostUnregister(prev_sim_mutations_ID),generation_shift,-1);
358 if(preserve_mutations(generation_shift) | take_sample(generation_shift)){ preserve_prev_run_mutations<<<200,512,0,pop_streams[1]>>>(mutations.d_mutations_ID, mutations.h_mutations_Index); }
359 cudaCheckErrorsAsync(cudaEventRecord(pop_events[0],pop_streams[0]),0,-1);
360 cudaCheckErrorsAsync(cudaEventRecord(pop_events[1],pop_streams[1]),0,-1);
363 for(
int pop = 2; pop < 2*mutations.h_num_populations; pop++){
364 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],pop_events[0],0),0,pop);
365 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],pop_events[1],0),0,pop);
370 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_inbreeding>
371 __host__
void compact(sim_struct & mutations,
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_inbreeding FI,
const int generation,
const int final_generation,
const bool preserve_mutations,
const int compact_interval, cudaStream_t * control_streams, cudaEvent_t * control_events, cudaStream_t * pop_streams){
372 unsigned int * d_flag;
373 unsigned int * d_count;
375 int padded_mut_index = (((mutations.h_mutations_Index>>10)+1*(mutations.h_mutations_Index%1024!=0))<<10);
377 cudaCheckErrorsAsync(cudaFree(mutations.d_mutations_freq),generation,-1);
378 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_flag,(padded_mut_index>>5)*
sizeof(
unsigned int)),generation,-1);
379 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_count,(padded_mut_index>>10)*
sizeof(
unsigned int)),generation,-1);
381 flag_segregating_mutations<<<800,128,0,control_streams[0]>>>(d_flag, d_count, demography, mutations.d_prev_freq, mutations.d_mutations_ID, mutations.h_num_populations, padded_mut_index, mutations.h_mutations_Index, mutations.h_array_Length, generation, mutations.warp_size);
382 cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,-1);
384 unsigned int * d_scan_Index;
385 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_scan_Index,(padded_mut_index>>10)*
sizeof(
unsigned int)),generation,-1);
387 void * d_temp_storage = NULL;
388 size_t temp_storage_bytes = 0;
389 cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_count, d_scan_Index, (padded_mut_index>>10), control_streams[0]),generation,-1);
390 cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),generation,-1);
391 cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_count, d_scan_Index, (padded_mut_index>>10), control_streams[0]),generation,-1);
392 cudaCheckErrorsAsync(cudaFree(d_temp_storage),generation,-1);
394 cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,-1);
396 int h_num_seg_mutations;
397 cudaCheckErrorsAsync(cudaMemcpyAsync(&h_num_seg_mutations, &d_scan_Index[(padded_mut_index>>10)-1],
sizeof(
int), cudaMemcpyDeviceToHost, control_streams[0]),generation,-1);
398 if(cudaStreamQuery(control_streams[0]) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(control_streams[0]),generation,-1); }
400 int old_array_Length = mutations.h_array_Length;
401 set_Index_Length(mutations, h_num_seg_mutations, mu_rate, demography, FI, mutations.h_num_sites, compact_interval, generation, final_generation);
405 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_temp,mutations.h_num_populations*mutations.h_array_Length*
sizeof(
float)),generation,-1);
406 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_temp2,mutations.h_array_Length*
sizeof(int4)),generation,-1);
408 const dim3 gridsize(800,mutations.h_num_populations,1);
409 scatter_arrays<<<gridsize,128,0,control_streams[0]>>>(d_temp, d_temp2, mutations.d_prev_freq, mutations.d_mutations_ID, d_flag, d_scan_Index, padded_mut_index, mutations.h_array_Length, old_array_Length, preserve_mutations, mutations.warp_size);
410 cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,-1);
412 cudaCheckErrorsAsync(cudaEventRecord(control_events[0],control_streams[0]),generation,-1);
414 for(
int pop = 0; pop < 2*mutations.h_num_populations; pop++){
415 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],control_events[0],0),generation,pop);
418 cudaCheckErrorsAsync(cudaFree(mutations.d_prev_freq),generation,-1);
419 cudaCheckErrorsAsync(cudaFree(mutations.d_mutations_ID),generation,-1);
421 mutations.d_prev_freq = d_temp;
422 mutations.d_mutations_ID = d_temp2;
424 cudaCheckErrorsAsync(cudaFree(d_flag),generation,-1);
425 cudaCheckErrorsAsync(cudaFree(d_scan_Index),generation,-1);
426 cudaCheckErrorsAsync(cudaFree(d_count),generation,-1);
428 cudaCheckErrorsAsync(cudaMalloc((
void**)&mutations.d_mutations_freq,mutations.h_num_populations*mutations.h_array_Length*
sizeof(
float)),generation,-1);
431 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_migration,
typename Functor_inbreeding>
432 __host__
void check_sim_parameters(
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_migration mig_prop,
const Functor_inbreeding FI, sim_struct & mutations,
const int generation){
433 int num_pop = mutations.h_num_populations;
434 for(
int pop = 0; pop < num_pop; pop++){
435 double migration = 0;
436 if(mu_rate(pop,generation) < 0){ fprintf(stderr,
"check_sim_parameters: mutation error: mu_rate < 0\tgeneration %d\t population %d\n",generation,pop); exit(1); }
437 int N = demography(pop,generation);
438 if(N > 0 && mutations.h_extinct[pop]){ fprintf(stderr,
"check_sim_parameters: demography error: extinct population with population size > 0\tgeneration %d\t population %d\n",generation,pop); exit(1); }
439 float fi = FI(pop,generation);
440 if(fi < 0) { fprintf(stderr,
"check_sim_parameters: inbreeding error: inbreeding coefficient < 0\tgeneration %d\t population %d\n",generation,pop); exit(1); }
441 if(fi > 1) { fprintf(stderr,
"check_sim_parameters: inbreeding error: inbreeding coefficient > 1\tgeneration %d\t population %d\n",generation,pop); exit(1); }
442 for(
int pop2 = 0; pop2 < num_pop; pop2++){
443 float m_from = mig_prop(pop,pop2,generation);
444 if(m_from < 0){ fprintf(stderr,
"check_sim_parameters: migration error: migration rate < 0\tgeneration %d\t population_from %d\t population_to %d\n",generation,pop,pop2); exit(1); }
445 if(m_from > 0 && ((N <= 0 || mutations.h_extinct[pop]) && !(demography(pop2,generation) <= 0 || mutations.h_extinct[pop2]))){ fprintf(stderr,
"migration error, migration from non-existant population\tgeneration %d\t population_from %d\t population_to %d\n",generation,pop,pop2); exit(1); }
447 float m_to = mig_prop(pop2,pop,generation);
448 migration += (double)m_to;
450 if((
float)migration != 1.f && !(N <= 0 || mutations.h_extinct[pop])){ fprintf(stderr,
"check_sim_parameters: migration error: migration rate does not sum to 1\tgeneration %d\t population_TO %d\t total_migration_proportion %f\n",generation,pop,migration); exit(1); }
454 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_inbreeding>
455 __host__
void calc_new_mutations_Index(sim_struct & mutations,
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_inbreeding FI,
const int2 seed,
const int generation){
456 int num_new_mutations = 0;
457 mutations.h_new_mutation_Indices[0] = mutations.h_mutations_Index;
458 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
459 int Nchrom_e = 2*demography(pop,generation)/(1+FI(pop,generation));
460 float mu = mu_rate(pop, generation);
461 float lambda = mu*Nchrom_e*mutations.h_num_sites;
462 if(Nchrom_e == 0 || lambda == 0 || mutations.h_extinct[pop]){ mutations.h_new_mutation_Indices[pop+1] = mutations.h_new_mutation_Indices[pop];
continue; }
463 int temp = max(RNG::ApproxRandPois1(lambda, lambda, mu, Nchrom_e*mutations.h_num_sites, seed, 1, generation, pop),0);
464 num_new_mutations += temp;
465 mutations.h_new_mutation_Indices[pop+1] = num_new_mutations + mutations.h_mutations_Index;
469 template <
typename Functor_timesample>
470 __host__ __forceinline__
int calc_sim_result_vector_length(Functor_timesample take_sample,
int starting_generation,
int final_generation) {
472 for(
int i = starting_generation; i < final_generation; i++){
if(take_sample(i)){ length++; } }
477 template <
typename Functor_demography,
typename Functor_inbreeding>
478 __host__ __forceinline__
void store_time_sample(
int & out_num_mutations,
int & out_max_num_mutations,
int & out_sampled_generation,
float *& out_mutations_freq,
GO_Fish::mutID *& out_mutations_ID,
bool *& out_extinct,
int *& out_Nchrom_e, sim_struct & mutations, Functor_demography demography, Functor_inbreeding FI,
int sampled_generation,
int final_generation, cudaStream_t * control_streams, cudaEvent_t * control_events){
479 out_num_mutations = mutations.h_mutations_Index;
480 out_sampled_generation = sampled_generation;
481 out_mutations_freq =
new float[mutations.h_num_populations*out_num_mutations];
482 cudaCheckErrors(cudaHostRegister(out_mutations_freq,mutations.h_num_populations*out_num_mutations*
sizeof(
float),cudaHostRegisterPortable),sampled_generation,-1);
483 cudaCheckErrorsAsync(cudaMemcpy2DAsync(out_mutations_freq, out_num_mutations*
sizeof(
float), mutations.d_prev_freq, mutations.h_array_Length*
sizeof(
float), out_num_mutations*
sizeof(
float), mutations.h_num_populations, cudaMemcpyDeviceToHost, control_streams[0]),sampled_generation,-1);
484 cudaCheckErrors(cudaHostUnregister(out_mutations_freq),sampled_generation,-1);
485 if(sampled_generation == final_generation){
487 cudaCheckErrors(cudaHostRegister(out_mutations_ID,out_num_mutations*
sizeof(int4),cudaHostRegisterPortable),sampled_generation,-1);
488 cudaCheckErrorsAsync(cudaMemcpyAsync(out_mutations_ID, mutations.d_mutations_ID, out_num_mutations*
sizeof(int4), cudaMemcpyDeviceToHost, control_streams[0]),sampled_generation,-1);
489 cudaCheckErrors(cudaHostUnregister(out_mutations_ID),sampled_generation,-1);
490 out_max_num_mutations = out_num_mutations;
492 out_extinct =
new bool[mutations.h_num_populations];
493 out_Nchrom_e =
new int[mutations.h_num_populations];
494 for(
int i = 0; i < mutations.h_num_populations; i++){
495 out_extinct[i] = mutations.h_extinct[i];
496 out_Nchrom_e[i] = 2*demography(i,sampled_generation)/(1+FI(i,sampled_generation));
499 cudaCheckErrorsAsync(cudaEventRecord(control_events[0],control_streams[0]),sampled_generation,-1);
524 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_migration,
typename Functor_selection,
typename Functor_inbreeding,
typename Functor_dominance,
typename Functor_preserve,
typename Functor_timesample>
525 __host__
void run_sim(
allele_trajectories & all_results,
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_migration mig_prop,
const Functor_selection sel_coeff,
const Functor_inbreeding FI,
const Functor_dominance dominance,
const Functor_preserve preserve_mutations,
const Functor_timesample take_sample){
526 run_sim(all_results, mu_rate, demography, mig_prop, sel_coeff, FI, dominance, preserve_mutations, take_sample,
allele_trajectories());
553 template <
typename Functor_mutation,
typename Functor_demography,
typename Functor_migration,
typename Functor_selection,
typename Functor_inbreeding,
typename Functor_dominance,
typename Functor_preserve,
typename Functor_timesample>
554 __host__
void run_sim(
allele_trajectories & all_results,
const Functor_mutation mu_rate,
const Functor_demography demography,
const Functor_migration mig_prop,
const Functor_selection sel_coeff,
const Functor_inbreeding FI,
const Functor_dominance dominance,
const Functor_preserve preserve_mutations,
const Functor_timesample take_sample,
const allele_trajectories & prev_sim){
556 using namespace go_fish_details;
563 sim_struct mutations;
566 mutations.h_new_mutation_Indices =
new int[mutations.h_num_populations+1];
568 mutations.h_extinct =
new bool[mutations.h_num_populations];
569 for(
int i = 0; i < mutations.h_num_populations; i++){ mutations.h_extinct[i] = 0; }
570 mutations.warp_size = devProp.warpSize;
572 cudaStream_t * pop_streams =
new cudaStream_t[2*mutations.h_num_populations];
573 cudaEvent_t * pop_events =
new cudaEvent_t[2*mutations.h_num_populations];
575 int num_control_streams = 1;
576 cudaStream_t * control_streams =
new cudaStream_t[num_control_streams];;
577 cudaEvent_t * control_events =
new cudaEvent_t[num_control_streams];;
579 for(
int pop = 0; pop < 2*mutations.h_num_populations; pop++){
580 cudaCheckErrors(cudaStreamCreate(&pop_streams[pop]),-1,pop);
581 cudaCheckErrorsAsync(cudaEventCreateWithFlags(&pop_events[pop],cudaEventDisableTiming),-1,pop);
584 for(
int stream = 0; stream < num_control_streams; stream++){
585 cudaCheckErrors(cudaStreamCreate(&control_streams[stream]),-1,stream);
586 cudaCheckErrorsAsync(cudaEventCreateWithFlags(&control_events[stream],cudaEventDisableTiming),-1,stream);
596 check_sim_parameters(mu_rate, demography, mig_prop, FI, mutations, generation);
597 initialize_mse(mutations, mu_rate, demography, sel_coeff, FI, dominance, final_generation, seed, (preserve_mutations(0)|take_sample(0)), compact_interval, pop_streams, pop_events);
602 if((sample_index >= 0 && sample_index < all_results.num_samples) && prev_sim.time_samples){
603 float prev_sim_num_sites = prev_sim.sim_run_constants.
num_sites;
604 int prev_sim_num_populations = prev_sim.sim_run_constants.
num_populations;
605 if(mutations.h_num_sites == prev_sim_num_sites && mutations.h_num_populations == prev_sim_num_populations){
607 init_blank_prev_run(mutations, generation, final_generation,
true, prev_sim_num_sites, prev_sim_num_populations, prev_sim.time_samples[sample_index]->sampled_generation, prev_sim.time_samples[sample_index]->extinct, prev_sim.time_samples[sample_index]->num_mutations, prev_sim.time_samples[sample_index]->mutations_freq, prev_sim.mutations_ID, mu_rate, demography, FI, preserve_mutations, take_sample, compact_interval, pop_streams, pop_events);
609 fprintf(stderr,
"run_sim error: prev_sim parameters do not match current simulation parameters: prev_sim num_sites %f\tcurrent_sim num_sites %f,\tprev_sim num_populations %d\tcurrent_sim num_populations %d\n",prev_sim_num_sites,mutations.h_num_sites,prev_sim_num_populations,mutations.h_num_populations); exit(1);
612 else if(sample_index < 0){
614 init_blank_prev_run(mutations, generation, final_generation,
false, -1.f, -1, -1, NULL, 0, NULL, NULL, mu_rate, demography, FI, preserve_mutations, take_sample, compact_interval, pop_streams, pop_events);
617 if(!prev_sim.time_samples){ fprintf(stderr,
"run_sim error: requested time sample from empty prev_sim\n"); exit(1); }
618 fprintf(stderr,
"run_sim error: requested sample index out of bounds for prev_sim: sample %d\t[0\t %d)\n",sample_index,prev_sim.num_samples); exit(1);
621 check_sim_parameters(mu_rate, demography, mig_prop, FI, mutations, generation);
625 int new_length = calc_sim_result_vector_length(take_sample,generation,final_generation);
626 all_results.initialize_sim_result_vector(new_length);
629 int sample_index = 0;
630 if(take_sample(generation) && generation != final_generation){
631 for(
int pop = 0; pop < 2*mutations.h_num_populations; pop++){ cudaCheckErrorsAsync(cudaStreamWaitEvent(control_streams[0],pop_events[pop],0), generation, pop); }
632 store_time_sample(all_results.time_samples[sample_index]->num_mutations, all_results.all_mutations, all_results.time_samples[sample_index]->sampled_generation, all_results.time_samples[sample_index]->mutations_freq, all_results.mutations_ID, all_results.time_samples[sample_index]->extinct, all_results.time_samples[sample_index]->Nchrom_e, mutations, demography, FI, generation, final_generation, control_streams, control_events);
643 int next_compact_generation = generation + compact_interval;
645 while((generation+1) <= final_generation){
647 check_sim_parameters(mu_rate, demography, mig_prop, FI, mutations, generation);
649 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
650 int N_ind = demography(pop,generation);
651 if(mutations.h_extinct[pop]){
continue; }
653 if(demography(pop,generation-1) > 0){
655 mutations.h_extinct[pop] =
true;
658 float F = FI(pop,generation);
659 int Nchrom_e = 2*N_ind/(1+F);
661 float h = dominance(pop,generation);
663 migration_selection_drift<<<600,128,0,pop_streams[pop]>>>(mutations.d_mutations_freq, mutations.d_prev_freq, mutations.d_mutations_ID, mutations.h_mutations_Index, mutations.h_array_Length, Nchrom_e, mig_prop, sel_coeff, F, h, seed, pop, mutations.h_num_populations, generation);
664 cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,pop);
665 cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop],pop_streams[pop]),generation,pop);
670 calc_new_mutations_Index(mutations, mu_rate, demography, FI, seed, generation);
671 for(
int pop = 0; pop < mutations.h_num_populations; pop++){
672 int N_ind = demography(pop,generation);
673 if((N_ind <= 0) || mutations.h_extinct[pop]){
continue; }
674 float F = FI(pop,generation);
675 int Nchrom_e = 2*N_ind/(1+F);
676 float freq = 1.f/Nchrom_e;
677 int prev_Index = mutations.h_new_mutation_Indices[pop];
678 int new_Index = mutations.h_new_mutation_Indices[pop+1];
679 add_new_mutations<<<20,512,0,pop_streams[pop+mutations.h_num_populations]>>>(mutations.d_mutations_freq, mutations.d_mutations_ID, prev_Index, new_Index, mutations.h_array_Length, freq, pop, mutations.h_num_populations, generation);
680 cudaCheckErrorsAsync(cudaPeekAtLastError(),generation,pop);
681 cudaCheckErrorsAsync(cudaEventRecord(pop_events[pop+mutations.h_num_populations],pop_streams[pop+mutations.h_num_populations]),generation,pop);
683 mutations.h_mutations_Index = mutations.h_new_mutation_Indices[mutations.h_num_populations];
686 bool preserve = (preserve_mutations(generation) | take_sample(generation));
688 for(
int pop1 = 0; pop1 < mutations.h_num_populations; pop1++){
689 for(
int pop2 = 0; pop2 < mutations.h_num_populations; pop2++){
690 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop1],pop_events[pop2],0), generation, pop1*mutations.h_num_populations+pop2);
691 cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop1],pop_events[pop2+mutations.h_num_populations],0), generation, pop1*mutations.h_num_populations+pop2);
693 if(generation == next_compact_generation || generation == final_generation || preserve){
694 cudaCheckErrorsAsync(cudaStreamWaitEvent(control_streams[0],pop_events[pop1],0), generation, pop1);
695 cudaCheckErrorsAsync(cudaStreamWaitEvent(control_streams[0],pop_events[pop1+mutations.h_num_populations],0), generation, pop1);
700 if(take_sample(generation-1)){
for(
int pop = 0; pop < 2*mutations.h_num_populations; pop++){ cudaCheckErrorsAsync(cudaStreamWaitEvent(pop_streams[pop],control_events[0],0),generation,pop); } }
702 std::swap(mutations.d_prev_freq,mutations.d_mutations_freq);
705 if((generation == next_compact_generation || generation == final_generation || preserve) && compact_interval > 0){ compact(mutations, mu_rate, demography, FI, generation, final_generation, preserve, compact_interval, control_streams, control_events, pop_streams); next_compact_generation = generation + compact_interval; }
709 if(take_sample(generation) && generation != final_generation){
710 store_time_sample(all_results.time_samples[sample_index]->num_mutations, all_results.all_mutations, all_results.time_samples[sample_index]->sampled_generation, all_results.time_samples[sample_index]->mutations_freq, all_results.mutations_ID, all_results.time_samples[sample_index]->extinct, all_results.time_samples[sample_index]->Nchrom_e, mutations, demography, FI, generation, final_generation, control_streams, control_events);
717 store_time_sample(all_results.time_samples[sample_index]->num_mutations, all_results.all_mutations, all_results.time_samples[sample_index]->sampled_generation, all_results.time_samples[sample_index]->mutations_freq, all_results.mutations_ID, all_results.time_samples[sample_index]->extinct, all_results.time_samples[sample_index]->Nchrom_e, mutations, demography, FI, generation, final_generation, control_streams, control_events);
721 if(cudaStreamQuery(control_streams[0]) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(control_streams[0]), generation, -1); }
723 for(
int pop = 0; pop < 2*mutations.h_num_populations; pop++){ cudaCheckErrorsAsync(cudaStreamDestroy(pop_streams[pop]),generation,pop); cudaCheckErrorsAsync(cudaEventDestroy(pop_events[pop]),generation,pop); }
724 for(
int stream = 0; stream < num_control_streams; stream++){ cudaCheckErrorsAsync(cudaStreamDestroy(control_streams[stream]),generation,stream); cudaCheckErrorsAsync(cudaEventDestroy(control_events[stream]),generation,stream); }
726 delete [] pop_streams;
727 delete [] pop_events;
728 delete [] control_streams;
729 delete [] control_events;
int prev_sim_sample
time sample of previous simulation to use for initializing current simulation
int seed2
random number seed 2 of 2
control and output data structure for GO_Fish simulation
int compact_interval
how often to compact the simulation and remove fixed or lost mutations
int num_populations
number of populations in simulation
int seed1
random number seed 1 of 2
structure specifying the ID for a mutation in a GO_Fish simulation
void swap(allele_trajectories &a, allele_trajectories &b)
swaps data held by allele_trajectories a and b
__host__ void run_sim(allele_trajectories &all_results, const Functor_mutation mu_rate, const Functor_demography demography, const Functor_migration mig_prop, const Functor_selection sel_coeff, const Functor_inbreeding FI, const Functor_dominance dominance, const Functor_preserve preserve_mutations, const Functor_timesample take_sample)
runs a single-locus Wright-Fisher simulation specified by the given simulation functions and sim_cons...
float num_sites
number of sites in simulation
bool init_mse
true: initialize simulation in mutation_selection_equilibrium; false: initialize blank simulation or ...
int device
GPU identity to run simulation on, if -1 next available GPU will be assigned.
Namespace for single-locus, forward, Monte-Carlo Wright-Fisher simulation and output data structures...
sim_constants sim_input_constants
constants for initializing the next simulation
int num_generations
number of generations in simulation