-
Notifications
You must be signed in to change notification settings - Fork 0
/
getrs_usm_het2.cpp
153 lines (128 loc) · 5.7 KB
/
getrs_usm_het2.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// MKL/SYCL includes
#include "oneapi/mkl/lapack.hpp"
// includes
#include <iostream>
#include <time.h>
void GETRS(cl::sycl::queue &queue_gpu, cl::sycl::queue &queue_host, cl::sycl::device &device_gpu, cl::sycl::device &device_host, cl::sycl::context &context_gpu, cl::sycl::context &context_host, std::int64_t info, double *A, double *b, std::int64_t size, std::int64_t A_size, std::int64_t b_size);
int main(int argc, char *argv[])
{
//Input size at compilation time
std::int64_t size = atoi(argv[1]);
std::int64_t i, j, INFO=0;
//Asynchronous error handler
auto error_handler = [&] (cl::sycl::exception_list exceptions) {
for (auto const& e : exceptions) {
try{
std::rethrow_exception(e);
}catch(oneapi::mkl::lapack::exception const& e) {
// lapack related
INFO = e.info();
std::cout << "Unexpected eception caught during asynchronous cal to LAPACK operation:\n"
<< e.what() << "\ninfo: " << e.info() << std::endl;
}catch(cl::sycl::exception const& e) {
// not lapack related
std::cout << "Unexpected exception caught during asynchronous operation:\n"
<< e.what() << std::endl;
INFO = -1;
}
}
};
// Device information
cl::sycl::queue queue_gpu({cl::sycl::gpu_selector{}}, error_handler);
cl::sycl::queue queue_host({cl::sycl::host_selector{}}, error_handler);
cl::sycl::context context_gpu = queue_gpu.get_context();
cl::sycl::context context_host = queue_host.get_context();
cl::sycl::device device_gpu = queue_gpu.get_device();
cl::sycl::device device_host = queue_host.get_device();
std::cout << "Device: "
<< device_host.get_info<cl::sycl::info::device::name>()
<<std::endl;
// Allocate and initialize arrays
std::int64_t coeff_matrix_size = size * size;
std::int64_t RHS_size = size * size;
double *coeff_matrix = cl::sycl::malloc_shared<double>(coeff_matrix_size, queue_host);
double *RHS = cl::sycl::malloc_shared<double>(RHS_size, queue_host);
// Initialize A to zero
if( coeff_matrix != NULL){
for(i = 0; i < size; i++){
for(j = 0; j < size; j++){
coeff_matrix[i*size+j] = 0.0;
}
}
}else{
std::cerr << "Could not allocate memory to coefficient matrix! Out of memory!" << "\n" << std::endl;
exit(1);
}
// Make coefficient matrix a tridiagonal matrix
for(i = 0; i < size; i++){
coeff_matrix[i*size+i] = 2.0;
if(i > 0){
coeff_matrix[i*size + i-1] = -1.0;}
}
for(i = 0; i < size-1; i++){
if(i < size - 1){
coeff_matrix[i*size + i+1] = -1.0;}
}
// Result array overwritten by the solution X
if(RHS != NULL){
for(i = 0; i < size; i++){
RHS[i] = 1.0;}
}else{
std::cerr << "Could not allocate memory to RHS array! Out of memory!" << "\n" << std::endl;
exit(1);
}
// Solve tridiagonal system
clock_t start, end;
start = clock();
GETRS(queue_gpu, queue_host, device_gpu, device_host, context_gpu, context_host, INFO, coeff_matrix, RHS, size, coeff_matrix_size, RHS_size);
end = clock();
std::cout << "\n" << "Time taken by getrs heterogeneous USM version 2: " << double(end - start)/double(CLOCKS_PER_SEC) << " seconds. "<< std::endl;
std::cout << "\n";
return 0;
}
void GETRS(cl::sycl::queue &queue_gpu, cl::sycl::queue &queue_host, cl::sycl::device &device_gpu, cl::sycl::device &device_host, cl::sycl::context &context_gpu, cl::sycl::context &context_host, std::int64_t info, double *A, double *b, std::int64_t size, std::int64_t A_size, std::int64_t b_size)
{
// oneMKL functions parameters
std::int64_t m = size, n = size, nrhs = size, lda = size, ldb = size;
// Allocate ipiv array
std::int64_t ipiv_size = n * n;
std::int64_t *ipiv = cl::sycl::malloc_shared<std::int64_t>(ipiv_size, queue_host);
try{
// Get sizes of scratchpad
std::int64_t getrf_scratchpad_size = oneapi::mkl::lapack::getrf_scratchpad_size<double>(queue_gpu, m, n, lda);
double *getrf_scratchpad = cl::sycl::malloc_shared<double>(getrf_scratchpad_size, device_gpu, context_gpu);
std::int64_t getrs_scratchpad_size = oneapi::mkl::lapack::getrs_scratchpad_size<double>(queue_gpu, oneapi::mkl::transpose::nontrans, n, nrhs, lda, ldb);
double *getrs_scratchpad = cl::sycl::malloc_shared<double>(getrs_scratchpad_size, device_gpu, context_gpu);
// Perform factorization
auto getrf_done_event = oneapi::mkl::lapack::getrf(queue_host, m, n, A, lda, ipiv, getrf_scratchpad, getrf_scratchpad_size);
// Print out factorization
/*
for(int j = 0; j < m; j++){
for(int i = 0; i < n; i++){
std::cout << A[j*lda+i] << " ";
}
std::cout << std::endl;
}*/
// Solve the system
auto getrs_done_event = oneapi::mkl::lapack::getrs(queue_host, oneapi::mkl::transpose::nontrans, n, nrhs, A, lda, ipiv, b, ldb, getrs_scratchpad, getrs_scratchpad_size);
// Wait until calculations are done
getrs_done_event.wait_and_throw();
//free scatch memory
cl::sycl::free(getrf_scratchpad, context_gpu);
cl::sycl::free(getrs_scratchpad, context_gpu);
} catch(oneapi::mkl::lapack::exception const &e) {
// Handle lapack related exceptions
std::cout << "Unexpected exception caught during synchronous call to LAPACK API:\nreason: " << e.what() << "\ninfo: " << e.info() << std::endl;
info = e.info();
} catch(cl::sycl::exception const& e) {
// Handle not lapack related exceptions
std::cout << "Unexpected exception caught during synchronous call to SYCL API:\n" << e.what() << std::endl;
info = -1;
}
std::cout << "getrs " << ((info == 0) ? "ran OK" : "FAILED") << std::endl;
// PRINT RESULT
/*std::cout << "\n" << "SOLUTION TO SYSTEM: " << "\n" << std::endl;
for(std::int64_t j = 0; j < size; j++){
std::cout << b[j] << "\n" << std::endl;
}*/
}