7 #include "../spectrum.h" 8 #include "../_internal/shared.cuh" 9 #include "../_outside_libraries/cub/device/device_scan.cuh" 10 #include "../_outside_libraries/cub/block/block_reduce.cuh" 11 #include <math_constants.h> 14 namespace Spectrum_details{
16 class transfer_allele_trajectories{
19 float * mutations_freq;
23 int sampled_generation;
25 time_sample(): num_mutations(0), sampled_generation(0) { mutations_freq = NULL; extinct = NULL; Nchrom_e = NULL; }
26 time_sample(
const GO_Fish::allele_trajectories & in,
int sample_index): num_mutations(in.time_samples[sample_index]->num_mutations), sampled_generation(in.time_samples[sample_index]->sampled_generation){
28 mutations_freq = in.time_samples[sample_index]->mutations_freq;
29 extinct = in.time_samples[sample_index]->extinct;
30 Nchrom_e = in.time_samples[sample_index]->Nchrom_e;
32 ~time_sample(){ mutations_freq = NULL; extinct = NULL; Nchrom_e = NULL; }
35 time_sample ** time_samples;
37 unsigned int num_samples;
49 num_sites = in.sim_run_constants.
num_sites;
55 transfer_allele_trajectories(): num_samples(0), all_mutations(0) { time_samples = 0; mutations_ID = 0; }
59 if(!in.time_samples || in.num_samples == 0){ fprintf(stderr,
"error transferring allele_trajectories to spectrum: empty allele_trajectories\n"); exit(1); }
60 num_samples = in.num_samples;
61 time_samples =
new time_sample *[num_samples];
63 for(
int i = 0; i < num_samples; i++){ time_samples[i] =
new time_sample(in,i); }
64 mutations_ID = in.mutations_ID;
71 ~transfer_allele_trajectories(){
delete [] time_samples; time_samples = NULL; mutations_ID = NULL; num_samples = 0; }
74 __global__
void population_hist(
unsigned int * out_histogram,
float * in_mutation_freq,
int Nchrome_e,
int num_mutations,
int num_sites){
75 int myID = blockIdx.x*blockDim.x + threadIdx.x;
77 for(
int id = myID;
id < num_mutations;
id+= blockDim.x*gridDim.x){
78 int index = round(Nchrome_e*in_mutation_freq[
id]);
79 if(index == Nchrome_e){ index = 0; }
80 atomicAdd(&out_histogram[index],1);
82 if(myID == 0){ atomicAdd(&out_histogram[0], (num_sites - num_mutations)); }
85 __global__
void uint_to_float(
float * out_array,
unsigned int * in_array,
int N){
86 int myID = blockIdx.x*blockDim.x + threadIdx.x;
87 for(
int id = myID;
id < N;
id+= blockDim.x*gridDim.x){ out_array[id] = in_array[id]; }
90 __global__
void binom_coeff(
float * binom_coeff,
int half_n,
int n){
91 int myIDx = blockIdx.x*blockDim.x + threadIdx.x;
93 for(
int idx = (myIDx+1); idx < half_n; idx+= blockDim.x*gridDim.x){ binom_coeff[idx] = logf((n+1.f-idx)/((
float)idx)); }
94 if(myIDx == 0){ binom_coeff[0] = 0.0; }
97 __global__
void print_Device_array_float(
float * array,
int num){
100 for(
int j = 0; j < num; j++){ printf(
"%d: %f\t",j,array[j]); }
104 __global__
void print_Device_array_double(
double * array,
int start,
int end){
107 for(
int j = start; j < end; j++){ printf(
"%d: %f\t",j,array[j]); }
111 template <
int BLOCK_THREADS>
112 __global__
void binom(
float * d_histogram,
const float *
const d_mutations_freq,
const float *
const d_binom_coeff,
const int half_n,
const int num_levels,
float num_sites,
int num_mutations){
113 int myIDx = blockIdx.x*blockDim.x + threadIdx.x;
114 int myIDy = blockIdx.y;
115 typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduceT;
116 __shared__
typename BlockReduceT::TempStorage temp_storage;
117 float thread_data[1];
119 for(
int idy = myIDy; idy <= num_levels; idy+= blockDim.y*gridDim.y){
121 for(
int idx = myIDx; idx < num_mutations; idx+= blockDim.x*gridDim.x){
122 float pf = d_mutations_freq[idx];
123 if(pf > 0 && pf < 1){
125 float powp = idy*logf(pf);
126 float powq = (num_levels-idy)*logf(qf);
128 if(idy < half_n){ coeff = d_binom_coeff[idy]; }
else{ coeff = d_binom_coeff[num_levels-idy]; }
129 thread_data[0] += expf(coeff+powp+powq);
130 }
else if(idy == 0){ thread_data[0] += 1.f; }
132 float aggregate = BlockReduceT(temp_storage).Sum(thread_data);
133 if(threadIdx.x == 0){
134 if(idy == num_levels){ atomicAdd(&d_histogram[0],aggregate); }
135 else{ atomicAdd(&d_histogram[idy],aggregate); }
138 if(myIDx == 0 && myIDy == 0){ atomicAdd(&d_histogram[0],(
float)(num_sites-num_mutations)); }
154 using namespace Spectrum_details;
155 set_cuda_device(cuda_device);
158 cudaCheckErrors(cudaStreamCreate(&stream),-1,-1);
160 float * d_mutations_freq;
161 float * d_histogram, * h_histogram;
162 transfer_allele_trajectories sample(all_results);
163 if(!(sample_index >= 0 && sample_index < sample.num_samples) || !(population_index >= 0 && population_index < sample.sim_run_constants.num_populations)){
164 fprintf(stderr,
"site_frequency_spectrum error: requested indices out of bounds: sample %d [0 %d), population %d [0 %d)\n",sample_index,sample.num_samples,population_index,sample.sim_run_constants.num_populations); exit(1);
167 int population_size = sample.time_samples[sample_index]->Nchrom_e[population_index];
168 int num_mutations = sample.time_samples[sample_index]->num_mutations;
169 float num_sites = sample.sim_run_constants.num_sites;
171 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_mutations_freq, sample.time_samples[sample_index]->num_mutations*
sizeof(
float)),-1,-1);
172 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_histogram, population_size*
sizeof(
float)),-1,-1);
173 cudaCheckErrors(cudaHostRegister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*
sizeof(
float), cudaHostRegisterPortable),-1,-1);
174 cudaCheckErrorsAsync(cudaMemcpyAsync(d_mutations_freq, &sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*
sizeof(
float), cudaMemcpyHostToDevice, stream),-1,-1);
175 cudaCheckErrors(cudaHostUnregister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations]),-1,-1);
177 unsigned int * d_pop_histogram;
178 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_pop_histogram, population_size*
sizeof(
unsigned int)),-1,-1);
179 cudaCheckErrorsAsync(cudaMemsetAsync(d_pop_histogram, 0, population_size*
sizeof(
unsigned int), stream),-1,-1);
180 population_hist<<<50,1024,0,stream>>>(d_pop_histogram, d_mutations_freq, population_size,
num_mutations,
num_sites);
181 cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
183 int num_threads = 1024;
184 if(population_size < 1024){ num_threads = 256;
if(population_size < 256){ num_threads = 128; } }
185 int num_blocks = max(population_size/num_threads,1);
186 uint_to_float<<<num_blocks,num_threads,0,stream>>>(d_histogram, d_pop_histogram, population_size);
187 cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
188 cudaCheckErrorsAsync(cudaFree(d_pop_histogram),-1,-1);
190 h_histogram =
new float[population_size];
191 cudaCheckErrors(cudaHostRegister(h_histogram,
sizeof(
float)*population_size, cudaHostRegisterPortable),-1,-1);
192 cudaCheckErrorsAsync(cudaMemcpyAsync(h_histogram, d_histogram, population_size*
sizeof(
float), cudaMemcpyDeviceToHost, stream),-1,-1);
193 cudaCheckErrors(cudaHostUnregister(h_histogram),-1,-1);
195 if(cudaStreamQuery(stream) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(stream), -1, -1); }
201 mySFS.
num_sites = sample.sim_run_constants.num_sites;
207 cudaCheckErrorsAsync(cudaFree(d_mutations_freq),-1,-1);
208 cudaCheckErrorsAsync(cudaFree(d_histogram),-1,-1);
209 cudaCheckErrorsAsync(cudaStreamDestroy(stream),-1,-1);
214 using namespace Spectrum_details;
215 set_cuda_device(cuda_device);
218 cudaCheckErrors(cudaStreamCreate(&stream),-1,-1);
220 float * d_mutations_freq;
221 float * d_histogram, * h_histogram;
222 transfer_allele_trajectories sample(all_results);
223 if(!(sample_index >= 0 && sample_index < sample.num_samples) || !(population_index >= 0 && population_index < sample.sim_run_constants.num_populations)){
224 fprintf(stderr,
"site_frequency_spectrum error: requested indices out of bounds: sample %d [0 %d), population %d [0 %d)\n",sample_index,sample.num_samples,population_index,sample.sim_run_constants.num_populations); exit(1);
228 int population_size = sample.time_samples[sample_index]->Nchrom_e[population_index];
229 if((sample_size <= 0) || (sample_size >= population_size)){ fprintf(stderr,
"site_frequency_spectrum error: requested sample_size out of range [1,population_size): sample_size %d [1,%d)",sample_size,population_size); exit(1); }
231 if(sample_size == 0){ num_levels = population_size; }
232 int num_mutations = sample.time_samples[sample_index]->num_mutations;
233 float num_sites = sample.sim_run_constants.num_sites;
235 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_mutations_freq, sample.time_samples[sample_index]->num_mutations*
sizeof(
float)),-1,-1);
236 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_histogram, num_levels*
sizeof(
float)),-1,-1);
237 cudaCheckErrors(cudaHostRegister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*
sizeof(
float), cudaHostRegisterPortable),-1,-1);
238 cudaCheckErrorsAsync(cudaMemcpyAsync(d_mutations_freq, &sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations], num_mutations*
sizeof(
float), cudaMemcpyHostToDevice, stream),-1,-1);
239 cudaCheckErrors(cudaHostUnregister(&sample.time_samples[sample_index]->mutations_freq[population_index*num_mutations]),-1,-1);
242 if((num_levels) % 2 == 0){ half_n = (num_levels)/2+1; }
243 else{ half_n = (num_levels+1)/2; }
245 float * d_binom_partial_coeff;
246 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_binom_partial_coeff, half_n*
sizeof(
float)),-1,-1);
247 int num_threads = 1024;
248 if(half_n < 1024){ num_threads = 256;
if(half_n < 256){ num_threads = 128; } }
249 int num_blocks = max(num_levels/num_threads,1);
250 binom_coeff<<<num_blocks,num_threads,0,stream>>>(d_binom_partial_coeff, half_n, num_levels);
251 cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
253 float * d_binom_coeff;
254 cudaCheckErrorsAsync(cudaMalloc((
void**)&d_binom_coeff, half_n*
sizeof(
float)),-1,-1);
256 void *d_temp_storage = NULL;
257 size_t temp_storage_bytes = 0;
258 cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_binom_partial_coeff, d_binom_coeff, half_n, stream),-1,-1);
259 cudaCheckErrorsAsync(cudaMalloc(&d_temp_storage, temp_storage_bytes),-1,-1);
260 cudaCheckErrorsAsync(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_binom_partial_coeff, d_binom_coeff, half_n, stream),-1,-1);
261 cudaCheckErrorsAsync(cudaFree(d_temp_storage),-1,-1);
262 cudaCheckErrorsAsync(cudaFree(d_binom_partial_coeff),-1,-1);
265 const dim3 gridsize(200,20,1);
266 const int num_threads_binom = 1024;
267 cudaCheckErrorsAsync(cudaMemsetAsync(d_histogram, 0, num_levels*
sizeof(
float), stream),-1,-1);
268 binom<num_threads_binom><<<gridsize,num_threads_binom,0,stream>>>(d_histogram, d_mutations_freq, d_binom_coeff, half_n, num_levels,
num_sites,
num_mutations);
269 cudaCheckErrorsAsync(cudaPeekAtLastError(),-1,-1);
270 cudaCheckErrorsAsync(cudaFree(d_binom_coeff),-1,-1);
274 h_histogram =
new float[num_levels];
275 cudaCheckErrors(cudaHostRegister(h_histogram,
sizeof(
float)*num_levels, cudaHostRegisterPortable),-1,-1);
276 cudaCheckErrorsAsync(cudaMemcpyAsync(h_histogram, d_histogram, num_levels*
sizeof(
float), cudaMemcpyDeviceToHost, stream),-1,-1);
277 cudaCheckErrors(cudaHostUnregister(h_histogram),-1,-1);
279 if(cudaStreamQuery(stream) != cudaSuccess){ cudaCheckErrors(cudaStreamSynchronize(stream), -1, -1); }
285 mySFS.
num_sites = sample.sim_run_constants.num_sites;
291 cudaCheckErrorsAsync(cudaFree(d_mutations_freq),-1,-1);
292 cudaCheckErrorsAsync(cudaFree(d_histogram),-1,-1);
293 cudaCheckErrorsAsync(cudaStreamDestroy(stream),-1,-1);
float * frequency_spectrum
site frequency spectrum data structure
control and output data structure for GO_Fish simulation
int sampled_generation
number of generations in the simulation at time of sampling
site frequency spectrum data structure (at the moment, functions only generate SFS for a single popul...
int * sample_size
number of samples taken for each population
int * populations
which populations are in SFS
Namespace for site frequency spectrum data structure and functions. (in prototype-phase) ...
float num_sites
number of sites in SFS
int num_populations
number of populations in simulation
float num_mutations
number of segregating mutations in SFS
void population_frequency_histogram(SFS &mySFS, const GO_Fish::allele_trajectories &all_results, const int sample_index, const int population_index, int cuda_device=-1)
create a frequency histogram of mutations at a single time point sample_index in a single population ...
void site_frequency_spectrum(SFS &mySFS, const GO_Fish::allele_trajectories &all_results, const int sample_index, const int population_index, const int sample_size, int cuda_device=-1)
create a single-population SFS of size sample_size from a single time point sample_index in a single ...
structure specifying the ID for a mutation in a GO_Fish simulation
float num_sites
number of sites in simulation
int num_populations
number of populations in SFS
int num_generations
number of generations in simulation