Parallel PopGen Package
go_fish_impl.cu
1 /*
2  * go_fish_impl.cu
3  *
4  * Author: David Lawrie
5  * implementation of non-template, non-inline functions for GO Fish simulation
6  */
7 
8 #include "../_internal/go_fish_impl.cuh"
9 
11 namespace go_fish_details{
12 
13 /*
14  * CUB scan (sum) phenomenon: float errors in the mse_integral can accumulate differently each run
15  * e.g.
16  * GO_Fish::const_parameter mutation(pow(10.f,-9)); //per-site mutation rate
17  GO_Fish::const_parameter inbreeding(1.f); //constant inbreeding
18  GO_Fish::const_demography demography(pow(10.f,5)*(1+inbreeding(0,0))); //number of individuals in population, set to maintain consistent effective number of chromosomes
19  GO_Fish::const_equal_migration migration(0.f,a.sim_input_constants.num_populations); //constant migration rate
20  float gamma = -5; //effective selection
21  GO_Fish::const_selection selection(gamma/(2*demography(0,0))); //constant selection coefficient
22  GO_Fish::const_parameter dominance(0.f); //constant allele dominance
23  * a.sim_input_constants.compact_interval = 20;
24  a.sim_input_constants.num_generations = pow(10.f,3);
25  a.sim_input_constants.num_sites = 20*2*pow(10.f,7);
26  a.sim_input_constants.seed1 = 0xbeeff00d + 2*14; //random number seeds
27  a.sim_input_constants.seed2 = 0xdecafbad - 2*14;
28 
29  one solution is to round the results in reverse array
30  another is to ignore (currently implemented)
31  another is to switch back to using doubles ... at least when summing up the mse_integral (difference in speed was slight)
32  */
33 
34 __global__ void reverse_array(float * array, const int N){
35  int myID = blockIdx.x*blockDim.x + threadIdx.x;
36  for(int id = myID; id < N/2; id += blockDim.x*gridDim.x){
37  float temp = array[N - id - 1];
38  array[N - id - 1] = array[id];
39  //float temp = roundf(10000*array[N - id - 1])/10000.f;
40  //array[N - id - 1] = roundf(10000*array[id])/10000.f;
41  array[id] = temp;
42  }
43 }
44 
45 __global__ void initialize_mse_mutation_array(float * mutations_freq, const int * freq_index, const int * scan_index, const int offset, const int Nchrom, const int population, const int num_populations, const int array_Length){
46  int myIDy = blockIdx.y*blockDim.y + threadIdx.y;
47  for(int idy = myIDy; idy < (Nchrom-1); idy+= blockDim.y*gridDim.y){
48  int myIDx = blockIdx.x*blockDim.x + threadIdx.x;
49  int start = scan_index[offset+idy];
50  int num_mutations = freq_index[offset+idy];
51  float freq = (idy+1.f)/Nchrom;
52  for(int idx = myIDx; idx < num_mutations; idx+= blockDim.x*gridDim.x){
53  for(int pop = 0; pop < num_populations; pop++){ mutations_freq[pop*array_Length + start + idx] = 0; }
54  mutations_freq[population*array_Length + start + idx] = freq;
55  }
56  }
57 }
58 
59 __global__ void mse_set_mutID(int4 * mutations_ID, const float * const mutations_freq, const int mutations_Index, const int num_populations, const int array_Length, const bool preserve_mutations){
60  int myID = blockIdx.x*blockDim.x + threadIdx.x;
61  for(int id = myID; id < mutations_Index; id+= blockDim.x*gridDim.x){
62  for(int pop = 0; pop < num_populations; pop++){
63  if(mutations_freq[pop*array_Length+id] > 0){
64  if(!preserve_mutations){ mutations_ID[id] = make_int4(0,pop,(id+1),0); }
65  else{ mutations_ID[id] = make_int4(0,pop,-1*(id+1),0); } //age: eventually will replace where mutations have age <= 0 (age before sim start)//threadID//population//to ensure that ID is non-zero, so that preservation flag can be a -ID
66  break; //assumes mutations are only in one population at start
67  }
68  }
69  }
70 }
71 
72 /*__global__ void print_Device_array_uint(unsigned int * array, int num){
73 
74  for(int i = 0; i < num; i++){
75  //if(i%1000 == 0){ printf("\n"); }
76  printf("%d: %d\t",i,array[i]);
77  }
78 }
79 
80 __global__ void sum_Device_array_bit(unsigned int * array, int num){
81 // int sum = 0;
82  for(int i = 0; i < num; i++){
83  //if(i%1000 == 0){ printf("\n"); }
84  unsigned int n = array[i];
85  while (n) {
86  if (n & 1)
87  sum+=1;
88  n >>= 1;
89  }
90  printf("%d\t",__popc(array[i]));
91  }
92 }
93 
94 __global__ void sum_Device_array_uint(unsigned int * array, int num){
95  int j = 0;
96  for(int i = 0; i < num; i++){
97  j += array[i];
98  }
99  printf("%d",j);
100 }
101 
102 __global__ void sum_Device_array_float(float * array, int start, int end){
103  double j = 0;
104  for(int i = start; i < end; i++){
105  j += array[i];
106  }
107  printf("%lf\n",j);
108 }
109 
110 __global__ void compareDevicearray(int * array1, int * array2, int array_length){
111  int myID = blockIdx.x*blockDim.x + threadIdx.x;
112  for(int id = myID; id < array_length; id+= blockDim.x*gridDim.x){
113  if(array1[id] != array2[id]){ printf("%d,%d,%d\t",id,array1[id],array2[id]); }
114  }
115 }
116 
117 __global__ void copyDevicearray(int * array1, int * array2, int array_length){
118  int myID = blockIdx.x*blockDim.x + threadIdx.x;
119  for(int id = myID; id < array_length; id+= blockDim.x*gridDim.x){ array1[id] = array2[id]; }
120 }
121 
122 __global__ void compareDevicearray(float * array1, float * array2, int array_length){
123  int myID = blockIdx.x*blockDim.x + threadIdx.x;
124  for(int id = myID; id < array_length; id+= blockDim.x*gridDim.x){
125  if(array1[id] != array2[id]){ printf("%d,%f,%f\t",id,array1[id],array2[id]); return; }
126  }
127 }
128 
129 __global__ void copyDevicearray(float * array1, float * array2, int array_length){
130  int myID = blockIdx.x*blockDim.x + threadIdx.x;
131  for(int id = myID; id < array_length; id+= blockDim.x*gridDim.x){ array1[id] = array2[id]; }
132 }
133 
134 __global__ void print_Device_array_float(float * array, int num){
135  printf("%5.10e\n",array[num]);
136 }*/
137 
138 __global__ void add_new_mutations(float * mutations_freq, int4 * mutations_ID, const int prev_mutations_Index, const int new_mutations_Index, const int array_Length, float freq, const int population, const int num_populations, const int generation){
139  int myID = blockIdx.x*blockDim.x + threadIdx.x;
140  for(int id = myID; (id < (new_mutations_Index-prev_mutations_Index)) && ((id + prev_mutations_Index) < array_Length); id+= blockDim.x*gridDim.x){
141  for(int pop = 0; pop < num_populations; pop++){ mutations_freq[(pop*array_Length+prev_mutations_Index+id)] = 0; }
142  mutations_freq[(population*array_Length+prev_mutations_Index+id)] = freq;
143  mutations_ID[(prev_mutations_Index+id)] = make_int4(generation,population,(id+1),0); //to ensure that ID is non-zero, so that preservation flag can be a -ID
144  }
145 }
146 
147 __global__ void scatter_arrays(float * new_mutations_freq, int4 * new_mutations_ID, const float * const mutations_freq, const int4 * const mutations_ID, const unsigned int * const flag, const unsigned int * const scan_Index, const int padded_mut_Index, const int new_array_Length, const int old_array_Length, const bool preserve_mutations, const int warp_size){
148 //adapted from https://www.csuohio.edu/engineering/sites/csuohio.edu.engineering/files/Research_Day_2015_EECS_Poster_14.pdf
149  int myID = blockIdx.x*blockDim.x + threadIdx.x;
150  int population = blockIdx.y;
151 
152  for(int id = myID; id < (padded_mut_Index >> 5); id+= blockDim.x*gridDim.x){
153  int lnID = threadIdx.x % warp_size;
154  int warpID = id >> 5;
155 
156  unsigned int predmask;
157  unsigned int cnt;
158 
159  predmask = flag[(warpID<<5)+lnID];
160  cnt = __popc(predmask);
161 
162  //parallel prefix sum
163 #pragma unroll
164  for(int offset = 1; offset < 32; offset<<=1){
165  unsigned int n = __shfl_up(cnt, offset);
166  if(lnID >= offset) cnt += n;
167  }
168 
169  unsigned int global_index = 0;
170  if(warpID > 0) global_index = scan_Index[warpID - 1];
171 
172  for(int i = 0; i < 32; i++){
173  unsigned int mask = __shfl(predmask, i); //broadcast from thread i
174  unsigned int sub_group_index = 0;
175  if(i > 0) sub_group_index = __shfl(cnt, i-1);
176  if(mask & (1 << lnID)){
177  int write = global_index + sub_group_index + __popc(mask & ((1 << lnID) - 1));
178  int read = (warpID<<10)+(i<<5)+lnID;
179  new_mutations_freq[population*new_array_Length + write] = mutations_freq[population*old_array_Length+read];
180  if(population == 0){
181  if(preserve_mutations){
182  int4 ID = mutations_ID[read];
183  new_mutations_ID[write] = make_int4(ID.x,ID.y,-1*abs(ID.z),ID.w);
184  }else{ new_mutations_ID[write] = mutations_ID[read]; }
185  }
186  }
187  }
188  }
189 }
190 
191 __global__ void preserve_prev_run_mutations(int4 * mutations_ID, const int mutations_Index){
192  int myID = blockIdx.x*blockDim.x + threadIdx.x;
193  for(int id = myID; id < mutations_Index; id+= blockDim.x*gridDim.x){ mutations_ID[id].z = -1*abs(mutations_ID[id].z); } //preservation flag is a -ID, use of absolute value is to ensure that if ID is already
194 }
195 
196 } /* ----- end namespace go_fish_details ----- */