sw_cuda_hypothetical.cu
#include <iostream>  // std::cout
#include <string>
#include <vector>
#include <utility>   // std::pair
#include <chrono>    // std::chrono timers used below
#include <cstdint>   // int16_t, int64_t
#include <cstdio>
#include "utils.h"   // expected to provide SIZE, QUANTITY, CUDA_*BLOCK_SIZE, cudaCheckErrors, to_byte_arr
// One (SIZE + 1) x (SIZE + 1) dynamic-programming matrix per sequence pair.
using dp_mat = int16_t[ SIZE + 1 ][ SIZE + 1 ];
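// Every DP cell of every matrix is filled by its own thread in a single launch. A
// dependency-respecting Smith-Waterman would sweep anti-diagonals in order (each cell needs
// its top, left, and diagonal neighbours first), so this kernel illustrates the per-cell
// arithmetic rather than a correct schedule - presumably the reason for the "hypothetical" name.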
__global__
void align_kernel(int *scores, dp_mat *matrices, char *sequences) {
// Thread-to-work mapping: tid selects the sequence pair (and its DP matrix),
// iid and jid select the cell inside that matrix.
// tid is a plain int so that more than 32767 pairs cannot overflow the index.
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int16_t iid = threadIdx.y + blockIdx.y * blockDim.y;
const int16_t jid = threadIdx.z + blockIdx.z * blockDim.z;
// Scoring scheme: gap and mismatch penalties, match reward.
auto const gap = -2;
auto const mismatch = -2;
auto const match = 3;
int16_t i = iid + 1;
int16_t j = jid + 1;
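// Smith-Waterman recurrence for cell (i, j):
// H[i][j] = max(0, H[i-1][j-1] + s(a_i, b_j), H[i-1][j] + gap, H[i][j-1] + gap)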
int16_t diagonal_value = matrices[ tid ][ i - 1 ][ j - 1 ];
diagonal_value += (
sequences[ tid * SIZE * 2 + i - 1 ] == sequences[ tid * SIZE * 2 + j - 1 + SIZE ] ? match : mismatch);
int16_t top_value = matrices[ tid ][ i - 1 ][ j ] + gap;
int16_t left_value = matrices[ tid ][ i ][ j - 1 ] + gap;
// diagonal_value = std::max(diagonal_value, std::max(top_value, left_value));
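// Branchless equivalent of the max above: for differences within int16_t range (true for
// these bounded scores), (a - b) >> 15 is all ones when a < b and zero otherwise, so
// a - ((a - b) & ((a - b) >> 15)) yields max(a, b) without a branch.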
int16_t temp =
top_value - ((top_value - left_value) & ((top_value - left_value) >> (sizeof(int16_t) * 8 - 1)));
int16_t target_value =
diagonal_value - ((diagonal_value - temp) & ((diagonal_value - temp) >> (sizeof(int16_t) * 8 - 1)));
// Branchy equivalent: matrices[ tid ][ i ][ j ] = (target_value > 0) ? target_value : 0;
matrices[ tid ][ i ][ j ] =
target_value - (target_value & (target_value >> (sizeof(int16_t) * 8 - 1)));
atomicMax(scores + tid, int(target_value));
}
void sw_cuda_hypothetical(std::vector<std::pair<std::string, std::string>> const &sequences){
// Instantiate host variables
std::vector<int> scores(QUANTITY);
// Instantiate device variables
dp_mat *dev_matrices;
char *dev_input;
int *dev_output;
int64_t matrices_size = QUANTITY * sizeof(dp_mat);
int64_t output_size = QUANTITY * sizeof(int);
int64_t input_size = QUANTITY * SIZE * 2;
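// Input layout (matching how the kernel indexes dev_input): QUANTITY records of
// 2 * SIZE chars each, the first SIZE chars holding one sequence of the pair and the
// next SIZE chars the other; to_byte_arr from utils.h is assumed to pack pairs this way.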
// Allocate memory on device
auto const start_time_1 = std::chrono::steady_clock::now();
cudaMalloc( (void**) &dev_output, output_size );
cudaMalloc( (void**) &dev_matrices, matrices_size );
cudaMalloc( (void**) &dev_input, input_size );
auto const end_time_1 = std::chrono::steady_clock::now();
std::cout << "Device malloc time: (μs) "
<< std::chrono::duration_cast<std::chrono::microseconds>( end_time_1 - start_time_1 ).count()
<< std::endl;
cudaCheckErrors("Failed to allocate device buffer");
// Preprocessing
auto const start_time_0 = std::chrono::steady_clock::now();
const char *sequences_bytes = to_byte_arr(sequences);
auto const end_time_0 = std::chrono::steady_clock::now();
std::cout << "Preprocessing time: (μs) "
<< std::chrono::duration_cast<std::chrono::microseconds>( end_time_0 - start_time_0 ).count()
<< std::endl;
// Send the data to device
auto const start_time_2 = std::chrono::steady_clock::now();
cudaMemcpy( dev_input, sequences_bytes, input_size, cudaMemcpyHostToDevice );
auto const end_time_2 = std::chrono::steady_clock::now();
std::cout << "To device transfer time: (μs) "
<< std::chrono::duration_cast<std::chrono::microseconds>( end_time_2 - start_time_2 ).count()
<< std::endl;
cudaCheckErrors("CUDA memcpy failure");
// Kernel
auto const start_time_3 = std::chrono::steady_clock::now();
dim3 blockSize(CUDA_XBLOCK_SIZE, CUDA_YBLOCK_SIZE, CUDA_ZBLOCK_SIZE);
dim3 gridSize(QUANTITY / CUDA_XBLOCK_SIZE, SIZE / CUDA_YBLOCK_SIZE, SIZE / CUDA_ZBLOCK_SIZE);
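// The integer divisions above assume QUANTITY and SIZE are exact multiples of the block
// dimensions; any remainder cells would otherwise never be scheduled.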
align_kernel<<< gridSize, blockSize >>>( dev_output, dev_matrices, dev_input );
// Kernel launches are asynchronous; synchronise so the timer measures execution, not just launch overhead.
cudaDeviceSynchronize();
auto const end_time_3 = std::chrono::steady_clock::now();
std::cout << "Exec time: (μs) "
<< std::chrono::duration_cast<std::chrono::microseconds>( end_time_3 - start_time_3 ).count()
<< std::endl;
cudaCheckErrors("Kernel launch failure");
// Retrieve results from device
auto const start_time_4 = std::chrono::steady_clock::now();
cudaMemcpy( scores.data(), dev_output, output_size, cudaMemcpyDeviceToHost );
auto const end_time_4 = std::chrono::steady_clock::now();
std::cout << "From device transfer time: (μs) "
<< std::chrono::duration_cast<std::chrono::microseconds>( end_time_4 - start_time_4 ).count()
<< std::endl;
cudaCheckErrors("CUDA memcpy failure");
// Free the memory on device
cudaFree( dev_input );
cudaFree( dev_matrices );
cudaFree( dev_output );
cudaCheckErrors("cudaFree fail");
// Report the average of the best local-alignment scores.
int64_t total_scores = 0;
for (auto e : scores) {
total_scores += e;
}
std::cout << "Average score: " << total_scores / QUANTITY << std::endl;
}
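// Hypothetical usage sketch (illustrative only, not part of the original build): one way to
// drive sw_cuda_hypothetical, assuming SIZE and QUANTITY come from utils.h. Guarded by a
// macro so it cannot collide with the project's real entry point; enable with
// -DSW_HYPOTHETICAL_DEMO_MAIN.
#ifdef SW_HYPOTHETICAL_DEMO_MAIN
int main() {
// Build QUANTITY placeholder pairs of SIZE-character sequences.
std::vector<std::pair<std::string, std::string>> pairs;
pairs.reserve(QUANTITY);
for (int q = 0; q < QUANTITY; ++q) {
pairs.emplace_back(std::string(SIZE, 'A'), std::string(SIZE, 'C'));
}
sw_cuda_hypothetical(pairs);
return 0;
}
#endif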