diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 9db5d89de1f83..b2c412ee3793e 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -286,7 +286,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
     auto maxBin            = allocator.getUntrackedParameter<unsigned int>("maxBin");
     size_t maxCachedBytes  = allocator.getUntrackedParameter<unsigned int>("maxCachedBytes");
     auto maxCachedFraction = allocator.getUntrackedParameter<double>("maxCachedFraction");
-    auto debug = allocator.getUntrackedParameter<bool>("debug");
+    auto debug             = allocator.getUntrackedParameter<bool>("debug");
 
     size_t minCachedBytes = std::numeric_limits<size_t>::max();
     int currentDevice;
diff --git a/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h
index eb0b6686ef8d5..33bc1bbb175fd 100644
--- a/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h
+++ b/HeterogeneousCore/CUDAServices/src/CachingDeviceAllocator.h
@@ -418,8 +418,8 @@ struct CachingDeviceAllocator
                     cached_bytes[device].free -= search_key.bytes;
                     cached_bytes[device].live += search_key.bytes;
 
-                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
-                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
+                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld, event %lld (previously associated with stream %lld, event %lld).\n",
+                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) block_itr->associated_stream, (long long) block_itr->ready_event);
 
                     cached_blocks.erase(block_itr);
 
@@ -500,8 +500,8 @@ struct CachingDeviceAllocator
             cached_bytes[device].live += search_key.bytes;
             mutex.Unlock();
 
-            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
-                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
+            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld, event %lld).\n",
+                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.ready_event);
 
             // Attempt to revert back to previous device if necessary
             if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
@@ -579,8 +579,8 @@ struct CachingDeviceAllocator
                 cached_blocks.insert(search_key);
                 cached_bytes[device].free += search_key.bytes;
 
-                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
+                if (debug) _CubLog("\tDevice %d returned %lld bytes at %p from associated stream %lld, event %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    device, (long long) search_key.bytes, d_ptr, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) cached_blocks.size(),
                     (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
             }
         }
@@ -607,8 +607,8 @@ struct CachingDeviceAllocator
             if (CubDebug(error = cudaFree(d_ptr))) return error;
             if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
 
-            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+            if (debug) _CubLog("\tDevice %d freed %lld bytes at %p from associated stream %lld, event %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device, (long long) search_key.bytes, d_ptr, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
         }
 
         // Reset device
diff --git a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
index 215e7be96a4d6..43ae2f42429a2 100644
--- a/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
+++ b/HeterogeneousCore/CUDAServices/src/CachingHostAllocator.h
@@ -407,8 +407,8 @@ struct CachingHostAllocator
                     cached_bytes.free -= search_key.bytes;
                     cached_bytes.live += search_key.bytes;
 
-                    if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld on device %lld (previously associated with stream %lld).\n",
-                        search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device, (long long)  block_itr->associated_stream);
+                    if (debug) _CubLog("\tHost reused cached block at %p (%lld bytes) for stream %lld, event %lld on device %lld (previously associated with stream %lld, event %lld).\n",
+                        search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) search_key.device, (long long) block_itr->associated_stream, (long long) block_itr->ready_event);
 
                     cached_blocks.erase(block_itr);
 
@@ -482,8 +482,8 @@ struct CachingHostAllocator
             cached_bytes.live += search_key.bytes;
             mutex.Unlock();
 
-            if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld on device %lld).\n",
-                      search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device);
+            if (debug) _CubLog("\tHost allocated new host block at %p (%lld bytes associated with stream %lld, event %lld on device %lld).\n",
+                      search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) search_key.device);
         }
 
         // Copy host pointer to output parameter
@@ -529,8 +529,8 @@ struct CachingHostAllocator
                 cached_blocks.insert(search_key);
                 cached_bytes.free += search_key.bytes;
 
-                if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld on device %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                    (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device, (long long) cached_blocks.size(),
+                if (debug) _CubLog("\tHost returned %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) search_key.device, (long long) cached_blocks.size(),
                     (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live);
             }
         }
@@ -554,8 +554,8 @@ struct CachingHostAllocator
             if (CubDebug(error = cudaFreeHost(d_ptr))) return error;
             if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
 
-            if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld on device %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.device, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live);
+            if (debug) _CubLog("\tHost freed %lld bytes from associated stream %lld, event %lld on device %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) search_key.ready_event, (long long) search_key.device, (long long) cached_blocks.size(), (long long) cached_bytes.free, (long long) live_blocks.size(), (long long) cached_bytes.live);
         }
 
         // Reset device
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h
index 4930307a89567..e9fe7aba2087b 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h
@@ -2,21 +2,35 @@
 #define HeterogeneousCore_CUDAUtilities_cudaCheck_h
 
 #include <iostream>
+#include <sstream>
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+namespace {
+  // Build the report (call site, failed command, error name and description) in a buffer, then write it to std::cerr in one go.
+  inline
+  void printCudaErrorMessage(const char* file, int line, const char* cmd, const char* error, const char* message) {
+    std::ostringstream out;
+    out << "\n";
+    out << file << ", line " << line << ":\n";
+    out << "cudaCheck(" << cmd << ");\n";
+    out << (error ? error : "unknown error") << ": " << (message ? message : "no description available") << "\n";  // guard: streaming a null const char* is undefined behaviour
+    std::cerr << out.str() << std::endl;  // use str(), not rdbuf(): an output-only stringbuf yields no characters when read and would set failbit on std::cerr
+  }
+
+}
+
 inline
 bool cudaCheck_(const char* file, int line, const char* cmd, CUresult result)
 {
-    //std::cerr << file << ", line " << line << ": " << cmd << std::endl;
-    if (result == CUDA_SUCCESS)
+    if (__builtin_expect(result == CUDA_SUCCESS, true))  // compiler hint: success is the expected outcome, keep the error path out of line
         return true;
 
     const char* error;
     const char* message;
     cuGetErrorName(result, &error);
     cuGetErrorString(result, &message);
-    std::cerr << file << ", line " << line << ": " << error << ": " << message << std::endl;
+    printCudaErrorMessage(file, line, cmd, error, message);  // hand call site, command text, driver error name and description to the reporting helper; abort() follows
     abort();
     return false;
 }
@@ -24,13 +38,12 @@ bool cudaCheck_(const char* file, int line, const char* cmd, CUresult result)
 inline
 bool cudaCheck_(const char* file, int line, const char* cmd, cudaError_t result)
 {
-    //std::cerr << file << ", line " << line << ": " << cmd << std::endl;
-    if (result == cudaSuccess)
+    if (__builtin_expect(result == cudaSuccess, true))  // compiler hint: success is the expected outcome, keep the error path out of line
         return true;
 
     const char* error = cudaGetErrorName(result);
     const char* message = cudaGetErrorString(result);
-    std::cerr << file << ", line " << line << ": " << error << ": " << message << std::endl;
+    printCudaErrorMessage(file, line, cmd, error, message);  // hand call site, command text, runtime error name and description to the reporting helper; abort() follows
     abort();
     return false;
 }