diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu index 2112f5f6027a5..ead2e3cc00504 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenGPUNoFit.cu @@ -83,7 +83,7 @@ void testMultiply() { kernelMultiply<<<1,1>>>(JGPU, CGPU, multiply_resultGPU); cudaDeviceSynchronize(); - cudaMemcpy(multiply_resultGPUret, multiply_resultGPU, + cudaMemcpy(multiply_resultGPUret, multiply_resultGPU, sizeof(Eigen::Matrix), cudaMemcpyDeviceToHost); printIt(multiply_resultGPUret); assert(isEqualFuzzy(multiply_result, (*multiply_resultGPUret))); @@ -91,7 +91,10 @@ void testMultiply() { void testInverse3x3() { std::cout << "TEST INVERSE 3x3" << std::endl; - Matrix3d m = Matrix3d::Random(); + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + Matrix3d m_inv = m.inverse(); Matrix3d *mGPU = nullptr; Matrix3d *mGPUret = nullptr; @@ -117,7 +120,10 @@ void testInverse3x3() { void testInverse4x4() { std::cout << "TEST INVERSE 4x4" << std::endl; - Matrix4d m = Matrix4d::Random(); + Matrix4d m; + fillMatrix(m); + m += m.transpose().eval(); + Matrix4d m_inv = m.inverse(); Matrix4d *mGPU = nullptr; Matrix4d *mGPUret = nullptr; @@ -143,9 +149,10 @@ void testInverse4x4() { void testEigenvalues() { std::cout << "TEST EIGENVALUES" << std::endl; - Matrix3d m = Matrix3d::Random(); - Matrix3d mt = m.transpose(); - m += mt; + Matrix3d m; + fillMatrix(m); + m += m.transpose().eval(); + Matrix3d * m_gpu = nullptr; Matrix3d * mgpudebug = new Matrix3d(); Eigen::SelfAdjointEigenSolver::RealVectorType *ret = new Eigen::SelfAdjointEigenSolver::RealVectorType; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 66d6d56286008..d6d6e2bdf24ff 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -14,8 +14,8 @@ using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; using namespace Eigen; -__global__ void -KernelFastFitAllHits(GPU::SimpleVector * foundNtuplets, +__global__ +void kernelFastFitAllHits(GPU::SimpleVector * foundNtuplets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, int hits_in_fit, float B, @@ -71,8 +71,8 @@ KernelFastFitAllHits(GPU::SimpleVector * foundNtuplets, fast_fit[helix_start] = Rfit::Fast_fit(hits[helix_start]); } -__global__ void -KernelCircleFitAllHits(GPU::SimpleVector * foundNtuplets, +__global__ +void kernelCircleFitAllHits(GPU::SimpleVector * foundNtuplets, int hits_in_fit, float B, Rfit::helix_fit *results, @@ -88,11 +88,10 @@ KernelCircleFitAllHits(GPU::SimpleVector * foundNtuplets, } #ifdef GPU_DEBUG - printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, helix_start: %d" - "cumulative_size: %d\n", - blockDim.x, blockIdx.x, threadIdx.x, helix_start, foundNtuplets->size()); + printf("blockDim.x: %d, blockIdx.x: %d, threadIdx.x: %d, helix_start: %d, cumulative_size: %d\n", + blockDim.x, blockIdx.x, threadIdx.x, helix_start, foundNtuplets->size()); #endif - u_int n = hits[helix_start].cols(); + auto n = hits[helix_start].cols(); Rfit::VectorNd rad = (hits[helix_start].block(0, 0, 2, n).colwise().norm()); @@ -102,17 +101,14 @@ KernelCircleFitAllHits(GPU::SimpleVector * foundNtuplets, fast_fit[helix_start], rad, B, true); #ifdef GPU_DEBUG - printf("KernelCircleFitAllHits circle.par(0): %d %f\n", helix_start, - circle_fit[helix_start].par(0)); - printf("KernelCircleFitAllHits circle.par(1): %d %f\n", helix_start, - circle_fit[helix_start].par(1)); - printf("KernelCircleFitAllHits circle.par(2): %d %f\n", helix_start, - circle_fit[helix_start].par(2)); + printf("kernelCircleFitAllHits circle.par(0): %d %f\n", helix_start, circle_fit[helix_start].par(0)); + printf("kernelCircleFitAllHits circle.par(1): %d %f\n", helix_start, circle_fit[helix_start].par(1)); + printf("kernelCircleFitAllHits circle.par(2): %d %f\n", helix_start, circle_fit[helix_start].par(2)); #endif } -__global__ void -KernelLineFitAllHits(GPU::SimpleVector * foundNtuplets, +__global__ +void kernelLineFitAllHits(GPU::SimpleVector * foundNtuplets, float B, Rfit::helix_fit *results, Rfit::Matrix3xNd *hits, @@ -127,20 +123,16 @@ KernelLineFitAllHits(GPU::SimpleVector * foundNtuplets, } #ifdef GPU_DEBUG - - printf("BlockDim.x: %d, BlockIdx.x: %d, threadIdx.x: %d, helix_start: %d, " - "cumulative_size: %d\n", - blockDim.x, blockIdx.x, threadIdx.x, helix_start, foundNtuplets->size()); + printf("blockDim.x: %d, blockIdx.x: %d, threadIdx.x: %d, helix_start: %d, cumulative_size: %d\n", + blockDim.x, blockIdx.x, threadIdx.x, helix_start, foundNtuplets->size()); #endif - line_fit[helix_start] = - Rfit::Line_fit(hits[helix_start], hits_cov[helix_start], - circle_fit[helix_start], fast_fit[helix_start], B, true); + line_fit[helix_start] = Rfit::Line_fit(hits[helix_start], hits_cov[helix_start], circle_fit[helix_start], fast_fit[helix_start], B, true); par_uvrtopak(circle_fit[helix_start], B, true); // Grab helix_fit from the proper location in the output vector - Rfit::helix_fit &helix = results[helix_start]; + auto & helix = results[helix_start]; helix.par << circle_fit[helix_start].par, line_fit[helix_start].par; // TODO: pass properly error booleans @@ -154,16 +146,13 @@ KernelLineFitAllHits(GPU::SimpleVector * foundNtuplets, helix.chi2_line = line_fit[helix_start].chi2; #ifdef GPU_DEBUG - - printf("KernelLineFitAllHits line.par(0): %d %f\n", helix_start, - circle_fit[helix_start].par(0)); - printf("KernelLineFitAllHits line.par(1): %d %f\n", helix_start, - line_fit[helix_start].par(1)); + printf("kernelLineFitAllHits line.par(0): %d %f\n", helix_start, circle_fit[helix_start].par(0)); + printf("kernelLineFitAllHits line.par(1): %d %f\n", helix_start, line_fit[helix_start].par(1)); #endif } -__global__ void -kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, +__global__ +void kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ nCells, GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets) { @@ -240,8 +229,8 @@ __global__ void kernel_find_ntuplets( // printf("in %d found quadruplets: %d\n", cellIndex, foundNtuplets->size()); } -__global__ void -kernel_print_found_ntuplets(GPU::SimpleVector *foundNtuplets, int maxPrint) { +__global__ +void kernel_print_found_ntuplets(GPU::SimpleVector *foundNtuplets, int maxPrint) { for (int i = 0; i < std::min(maxPrint, foundNtuplets->size()); ++i) { printf("\nquadruplet %d: %d %d %d %d\n", i, (*foundNtuplets)[i].hitId[0], @@ -371,19 +360,22 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10); - KernelFastFitAllHits<<>>( + kernelFastFitAllHits<<>>( d_foundNtupletsVec_[regionIndex], hh.gpu_d, 4, bField_, helix_fit_resultsGPU_, hitsGPU_, hits_covGPU_, circle_fit_resultsGPU_, fast_fit_resultsGPU_, line_fit_resultsGPU_); cudaCheck(cudaGetLastError()); - KernelCircleFitAllHits<<>>( + blockSize = 256; + numberOfBlocks = (maxNumberOfQuadruplets_ + blockSize - 1) / blockSize; + + kernelCircleFitAllHits<<>>( d_foundNtupletsVec_[regionIndex], 4, bField_, helix_fit_resultsGPU_, hitsGPU_, hits_covGPU_, circle_fit_resultsGPU_, fast_fit_resultsGPU_, line_fit_resultsGPU_); cudaCheck(cudaGetLastError()); - KernelLineFitAllHits<<>>( + kernelLineFitAllHits<<>>( d_foundNtupletsVec_[regionIndex], bField_, helix_fit_resultsGPU_, hitsGPU_, hits_covGPU_, circle_fit_resultsGPU_, fast_fit_resultsGPU_, line_fit_resultsGPU_);