Skip to content

Commit

Permalink
Replace CUDA API wrapper memory operations with native CUDA calls (#395)
Browse files Browse the repository at this point in the history
  • Loading branch information
waredjeb authored and fwyzard committed Apr 1, 2021
1 parent 80ec6eb commit 4181007
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 31 deletions.
13 changes: 7 additions & 6 deletions RecoPixelVertexing/PixelTrackFitting/test/testEigenGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,10 @@ void testFit() {
kernelFastFit<N><<<Ntracks / 64, 64>>>(hitsGPU, fast_fit_resultsGPU);
cudaDeviceSynchronize();

cudaMemcpy(fast_fit_resultsGPUret,
fast_fit_resultsGPU,
Rfit::maxNumberOfTracks() * sizeof(Vector4d),
cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(fast_fit_resultsGPUret,
fast_fit_resultsGPU,
Rfit::maxNumberOfTracks() * sizeof(Vector4d),
cudaMemcpyDeviceToHost));
Rfit::Map4d fast_fit(fast_fit_resultsGPUret + 10, 4);
std::cout << "Fitted values (FastFit, [X0, Y0, R, tan(theta)]): GPU\n" << fast_fit << std::endl;
assert(isEqualFuzzy(fast_fit_results, fast_fit));
Expand Down Expand Up @@ -311,13 +311,14 @@ void testFit() {

std::cout << "Fitted values (CircleFit):\n" << circle_fit_results.par << std::endl;

cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost);
cudaCheck(
cudaMemcpy(circle_fit_resultsGPUret, circle_fit_resultsGPU, sizeof(Rfit::circle_fit), cudaMemcpyDeviceToHost));
std::cout << "Fitted values (CircleFit) GPU:\n" << circle_fit_resultsGPUret->par << std::endl;
assert(isEqualFuzzy(circle_fit_results.par, circle_fit_resultsGPUret->par));

std::cout << "Fitted values (LineFit):\n" << line_fit_results.par << std::endl;
// LINE_FIT GPU
cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(line_fit_resultsGPUret, line_fit_resultsGPU, sizeof(Rfit::line_fit), cudaMemcpyDeviceToHost));
std::cout << "Fitted values (LineFit) GPU:\n" << line_fit_resultsGPUret->par << std::endl;
assert(isEqualFuzzy(line_fit_results.par, line_fit_resultsGPUret->par, N == 5 ? 1e-4 : 1e-6)); // requires fma on CPU

Expand Down
52 changes: 27 additions & 25 deletions RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,19 @@ void testMultiply() {
Eigen::Matrix<double, row1, col2> *multiply_resultGPU = nullptr;
Eigen::Matrix<double, row1, col2> *multiply_resultGPUret = new Eigen::Matrix<double, row1, col2>();

cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix<double, row1, col1>));
cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix<double, row2, col2>));
cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>));
cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice);
cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice);
cudaMemcpy(multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice);
cudaCheck(cudaMalloc((void **)&JGPU, sizeof(Eigen::Matrix<double, row1, col1>)));
cudaCheck(cudaMalloc((void **)&CGPU, sizeof(Eigen::Matrix<double, row2, col2>)));
cudaCheck(cudaMalloc((void **)&multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>)));
cudaCheck(cudaMemcpy(JGPU, &J, sizeof(Eigen::Matrix<double, row1, col1>), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(CGPU, &C, sizeof(Eigen::Matrix<double, row2, col2>), cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(
multiply_resultGPU, &multiply_result, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyHostToDevice));

kernelMultiply<<<1, 1>>>(JGPU, CGPU, multiply_resultGPU);
cudaDeviceSynchronize();

cudaMemcpy(
multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(
multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix<double, row1, col2>), cudaMemcpyDeviceToHost));
printIt(multiply_resultGPUret);
assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret)));
}
Expand All @@ -104,14 +105,14 @@ void testInverse3x3() {
std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
#endif
cudaMalloc((void **)&mGPU, sizeof(Matrix3d));
cudaMalloc((void **)&mGPUret, sizeof(Matrix3d));
cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice);
cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix3d)));
cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix3d)));
cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));

kernelInverse3x3<<<1, 1>>>(mGPU, mGPUret);
cudaDeviceSynchronize();

cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
#if TEST_DEBUG
std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
#endif
Expand All @@ -133,14 +134,14 @@ void testInverse4x4() {
std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
#endif
cudaMalloc((void **)&mGPU, sizeof(Matrix4d));
cudaMalloc((void **)&mGPUret, sizeof(Matrix4d));
cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice);
cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix4d)));
cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix4d)));
cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix4d), cudaMemcpyHostToDevice));

kernelInverse4x4<<<1, 1>>>(mGPU, mGPUret);
cudaDeviceSynchronize();

cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix4d), cudaMemcpyDeviceToHost));
#if TEST_DEBUG
std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
#endif
Expand All @@ -162,14 +163,14 @@ void testInverse5x5() {
std::cout << "Here is the matrix m:" << std::endl << m << std::endl;
std::cout << "Its inverse is:" << std::endl << m.inverse() << std::endl;
#endif
cudaMalloc((void **)&mGPU, sizeof(Matrix5d));
cudaMalloc((void **)&mGPUret, sizeof(Matrix5d));
cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice);
cudaCheck(cudaMalloc((void **)&mGPU, sizeof(Matrix5d)));
cudaCheck(cudaMalloc((void **)&mGPUret, sizeof(Matrix5d)));
cudaCheck(cudaMemcpy(mGPU, &m, sizeof(Matrix5d), cudaMemcpyHostToDevice));

kernelInverse5x5<<<1, 1>>>(mGPU, mGPUret);
cudaDeviceSynchronize();

cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(mCPUret, mGPUret, sizeof(Matrix5d), cudaMemcpyDeviceToHost));
#if TEST_DEBUG
std::cout << "Its GPU inverse is:" << std::endl << (*mCPUret) << std::endl;
#endif
Expand All @@ -195,15 +196,16 @@ void testEigenvalues() {
std::cout << "The eigenvalues of M are:" << std::endl << (*ret) << std::endl;
std::cout << "*************************\n\n" << std::endl;
#endif
cudaMalloc((void **)&m_gpu, sizeof(Matrix3d));
cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType));
cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice);
cudaCheck(cudaMalloc((void **)&m_gpu, sizeof(Matrix3d)));
cudaCheck(cudaMalloc((void **)&ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType)));
cudaCheck(cudaMemcpy(m_gpu, &m, sizeof(Matrix3d), cudaMemcpyHostToDevice));

kernel<<<1, 1>>>(m_gpu, ret_gpu);
cudaDeviceSynchronize();

cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost);
cudaMemcpy(ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost);
cudaCheck(cudaMemcpy(mgpudebug, m_gpu, sizeof(Matrix3d), cudaMemcpyDeviceToHost));
cudaCheck(cudaMemcpy(
ret1, ret_gpu, sizeof(Eigen::SelfAdjointEigenSolver<Matrix3d>::RealVectorType), cudaMemcpyDeviceToHost));
#if TEST_DEBUG
std::cout << "GPU Generated Matrix M 3x3:\n" << (*mgpudebug) << std::endl;
std::cout << "GPU The eigenvalues of M are:" << std::endl << (*ret1) << std::endl;
Expand Down

0 comments on commit 4181007

Please sign in to comment.