AIDASoft · tmadlener · Nov 14, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/include/podio/ROOTNTupleWriter.h b/include/podio/ROOTNTupleWriter.h
@@ -37,24 +37,23 @@ class ROOTNTupleWriter {
   std::unique_ptr<ROOT::Experimental::RNTupleModel> createModels(const std::vector<StoreCollection>& collections);
 
   std::unique_ptr<ROOT::Experimental::RNTupleModel> m_metadata{};
-  std::unordered_map<std::string, std::unique_ptr<ROOT::Experimental::RNTupleWriter>> m_writers{};
   std::unique_ptr<ROOT::Experimental::RNTupleWriter> m_metadataWriter{};
 
   std::unique_ptr<TFile> m_file{};
 
   DatamodelDefinitionCollector m_datamodelCollector{};
 
   struct CollectionInfo {
-    std::vector<unsigned int> id{};
+    std::vector<uint32_t> id{};
     std::vector<std::string> name{};
     std::vector<std::string> type{};
     std::vector<short> isSubsetCollection{};
     std::vector<SchemaVersionT> schemaVersion{};
+    std::unique_ptr<ROOT::Experimental::RNTupleWriter> writer{nullptr};
   };
+  CollectionInfo& getCategoryInfo(const std::string& category);
 
-  std::unordered_map<std::string, CollectionInfo> m_collectionInfo{};
-
-  std::set<std::string> m_categories{};
+  std::unordered_map<std::string, CollectionInfo> m_categories{};
 
   bool m_finished{false};
 

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -109,6 +109,7 @@ PODIO_ADD_LIB_AND_DICT(podioRootIO "${root_headers}" "${root_sources}" root_sele
 target_link_libraries(podioRootIO PUBLIC podio::podio ROOT::Core ROOT::RIO ROOT::Tree)
 if(ENABLE_RNTUPLE)
   target_link_libraries(podioRootIO PUBLIC ROOT::ROOTNTuple)
+  target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=1)
 endif()
 
 

diff --git a/src/ROOTFrameWriter.cc b/src/ROOTFrameWriter.cc
@@ -41,9 +41,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
   collections.reserve(catInfo.collsToWrite.size());
   for (const auto& name : catInfo.collsToWrite) {
     auto* coll = frame.getCollectionForWrite(name);
+    if (!coll) {
+      // Make sure all collections that we want to write are actually available
+      // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
+      throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
+    }
     collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));
-
-    m_datamodelCollector.registerDatamodelDefinition(coll, name);
   }
 
   // We will at least have a parameters branch, even if there are no
@@ -52,6 +55,11 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
     initBranches(catInfo, collections, const_cast<podio::GenericParameters&>(frame.getParameters()));
 
   } else {
+    // Make sure that the category contents are consistent with the initial
+    // frame in the category
+    if (!root_utils::checkConsistentColls(catInfo.collsToWrite, collsToWrite)) {
+      throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content");
+    }
     resetBranches(catInfo.branches, collections, &const_cast<podio::GenericParameters&>(frame.getParameters()));
   }
 
@@ -73,6 +81,10 @@ void ROOTFrameWriter::initBranches(CategoryInfo& catInfo, const std::vector<Stor
 
   // First collections
   for (auto& [name, coll] : collections) {
+    // For the first entry in each category we also record the datamodel
+    // definition
+    m_datamodelCollector.registerDatamodelDefinition(coll, name);
+
     root_utils::CollectionBranches branches;
     const auto buffers = coll->getBuffers();
     // For subset collections we only fill one references branch

diff --git a/src/ROOTNTupleWriter.cc b/src/ROOTNTupleWriter.cc
@@ -64,22 +64,49 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
 
 void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& category,
                                   const std::vector<std::string>& collsToWrite) {
+  auto& catInfo = getCategoryInfo(category);
+
+  // Use the writer as proxy to check whether this category has been initialized
+  // already and do so if not
+  const bool new_category = (catInfo.writer == nullptr);
+  if (new_category) {
+    // This is the minimal information that we need for now
+    catInfo.name = collsToWrite;
+  }
 
   std::vector<StoreCollection> collections;
-  collections.reserve(collsToWrite.size());
-  for (const auto& name : collsToWrite) {
+  collections.reserve(catInfo.name.size());
+  // Only loop over the collections that were requested in the first Frame of
+  // this category
+  for (const auto& name : catInfo.name) {
     auto* coll = frame.getCollectionForWrite(name);
+    if (!coll) {
+      // Make sure all collections that we want to write are actually available
+      // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
+      throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
+    }
+
     collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));
   }
 
-  bool new_category = false;
-  if (m_writers.find(category) == m_writers.end()) {
-    new_category = true;
+  if (new_category) {
+    // Now we have enough info to populate the rest
     auto model = createModels(collections);
-    m_writers[category] = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
+    catInfo.writer = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
+
+    for (const auto& [name, coll] : collections) {
+      catInfo.id.emplace_back(coll->getID());
+      catInfo.type.emplace_back(coll->getTypeName());
+      catInfo.isSubsetCollection.emplace_back(coll->isSubsetCollection());
+      catInfo.schemaVersion.emplace_back(coll->getSchemaVersion());
+    }
+  } else {
+    if (!root_utils::checkConsistentColls(catInfo.name, collsToWrite)) {
+      throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content");
+    }
   }
 
-  auto entry = m_writers[category]->GetModel()->CreateBareEntry();
+  auto entry = m_categories[category].writer->GetModel()->CreateBareEntry();
 
   ROOT::Experimental::RNTupleWriteOptions options;
   options.SetCompression(ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose);
@@ -121,14 +148,6 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
     // Not supported
     // entry->CaptureValueUnsafe(root_utils::paramBranchName,
     // &const_cast<podio::GenericParameters&>(frame.getParameters()));
-
-    if (new_category) {
-      m_collectionInfo[category].id.emplace_back(coll->getID());
-      m_collectionInfo[category].name.emplace_back(name);
-      m_collectionInfo[category].type.emplace_back(coll->getTypeName());
-      m_collectionInfo[category].isSubsetCollection.emplace_back(coll->isSubsetCollection());
-      m_collectionInfo[category].schemaVersion.emplace_back(coll->getSchemaVersion());
-    }
   }
 
   auto params = frame.getParameters();
@@ -137,8 +156,7 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
   fillParams<double>(params, entry.get());
   fillParams<std::string>(params, entry.get());
 
-  m_writers[category]->Fill(*entry);
-  m_categories.insert(category);
+  m_categories[category].writer->Fill(*entry);
 }
 
 std::unique_ptr<ROOT::Experimental::RNTupleModel>
@@ -215,6 +233,15 @@ ROOTNTupleWriter::createModels(const std::vector<StoreCollection>& collections)
   return model;
 }
 
+ROOTNTupleWriter::CollectionInfo& ROOTNTupleWriter::getCategoryInfo(const std::string& category) {
+  if (auto it = m_categories.find(category); it != m_categories.end()) {
+    return it->second;
+  }
+
+  auto [it, _] = m_categories.try_emplace(category, CollectionInfo{});
+  return it->second;
+}
+
 void ROOTNTupleWriter::finish() {
 
   auto podioVersion = podio::version::build_version;
@@ -227,21 +254,21 @@ void ROOTNTupleWriter::finish() {
   *edmField = edmDefinitions;
 
   auto availableCategoriesField = m_metadata->MakeField<std::vector<std::string>>(root_utils::availableCategories);
-  for (auto& [c, _] : m_collectionInfo) {
+  for (auto& [c, _] : m_categories) {
     availableCategoriesField->push_back(c);
   }
 
-  for (auto& category : m_categories) {
+  for (auto& [category, collInfo] : m_categories) {
     auto idField = m_metadata->MakeField<std::vector<unsigned int>>({root_utils::idTableName(category)});
-    *idField = m_collectionInfo[category].id;
+    *idField = collInfo.id;
     auto collectionNameField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collectionName(category)});
-    *collectionNameField = m_collectionInfo[category].name;
+    *collectionNameField = collInfo.name;
     auto collectionTypeField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collInfoName(category)});
-    *collectionTypeField = m_collectionInfo[category].type;
+    *collectionTypeField = collInfo.type;
     auto subsetCollectionField = m_metadata->MakeField<std::vector<short>>({root_utils::subsetCollection(category)});
-    *subsetCollectionField = m_collectionInfo[category].isSubsetCollection;
+    *subsetCollectionField = collInfo.isSubsetCollection;
     auto schemaVersionField = m_metadata->MakeField<std::vector<SchemaVersionT>>({"schemaVersion_" + category});
-    *schemaVersionField = m_collectionInfo[category].schemaVersion;
+    *schemaVersionField = collInfo.schemaVersion;
   }
 
   m_metadata->Freeze();
@@ -254,7 +281,9 @@ void ROOTNTupleWriter::finish() {
 
   // All the tuple writers must be deleted before the file so that they flush
   // unwritten output
-  m_writers.clear();
+  for (auto& [_, catInfo] : m_categories) {
+    catInfo.writer.reset();
+  }
   m_metadataWriter.reset();
 
   m_finished = true;

diff --git a/src/rootUtils.h b/src/rootUtils.h
@@ -272,6 +272,30 @@ inline std::vector<std::string> sortAlphabeticaly(std::vector<std::string> strin
   return strings;
 }
 
+/**
+ * Check whether existingColls and candidateColls both contain the same
+ * collection names. Returns false if the two vectors differ in content. Inputs
+ * can have random order wrt each other, but the assumption is that each vector
+ * only contains unique names.
+ */
+inline bool checkConsistentColls(const std::vector<std::string>& existingColls,
+                                 const std::vector<std::string>& candidateColls) {
+  if (existingColls.size() != candidateColls.size()) {
+    return false;
+  }
+
+  // Since we are guaranteed to have unique names here, we can just look for
+  // collisions brute force, which seems to be quickest approach for vector
+  // sizes we typically have here (few hundred)
+  for (const auto& id : candidateColls) {
+    if (std::find(existingColls.begin(), existingColls.end(), id) == existingColls.end()) {
+      return false;
+    }
+  }
-  // Since we are guaranteed to have unique names here, we can just look for
-  // collisions brute force, which seems to be quickest approach for vector
-  // sizes we typically have here (few hundred)
-  for (const auto& id : candidateColls) {
-    if (std::find(existingColls.begin(), existingColls.end(), id) == existingColls.end()) {
-      return false;
-    }
-  }
+  // Since we are guaranteed to have unique names here, we can just look for
+  // collisions brute force, which seems to be quickest approach for vector
+  // sizes we typically have here (few hundred)
+  for (const auto& id : candidateColls) {
+    if (!std::binary_search(existingColls.begin(), existingColls.end(), id, [](const auto& lhs, const auto& rhs) { std::lexicographical_compare(
+        lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
+        [](const auto& cl, const auto& cr) { return std::tolower(cl) < std::tolower(cr); }))) {
+      return false;
+    }
+  }
-  // Since we are guaranteed to have unique names here, we can just look for
-  // collisions brute force, which seems to be quickest approach for vector
-  // sizes we typically have here (few hundred)
-  for (const auto& id : candidateColls) {
-    if (std::find(existingColls.begin(), existingColls.end(), id) == existingColls.end()) {
-      return false;
-    }
-  }
+  // Since we are guaranteed to have unique names here, we can just look for
+  // collisions brute force, which seems to be quickest approach for vector
+  // sizes we typically have here (few hundred)
+  for (const auto& id : candidateColls) {
+    if (!std::binary_search(existingColls.begin(), existingColls.end(), id, [](const auto& lhs, const auto& rhs) { std::lexicographical_compare(
+        lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
+        [](const auto& cl, const auto& cr) { return std::tolower(cl) < std::tolower(cr); }))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 } // namespace podio::root_utils
 
 #endif
diff --git a/tests/unittests/CMakeLists.txt b/tests/unittests/CMakeLists.txt
@@ -88,7 +88,7 @@ else()
       WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
       TEST_PREFIX "UT_" # make it possible to filter easily with -R ^UT
       TEST_SPEC ${filter_tests} # discover only tests that are known to not fail
-      DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
+      DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:${PROJECT_BINARY_DIR}/tests:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
       PROPERTIES
         ENVIRONMENT
         PODIO_SIOBLOCK_PATH=${CMAKE_CURRENT_BINARY_DIR}

diff --git a/tests/unittests/unittest.cpp b/tests/unittests/unittest.cpp
@@ -7,12 +7,15 @@
 #include <type_traits>
 #include <vector>
 
+#include "catch2/catch_template_test_macros.hpp"
 #include "catch2/catch_test_macros.hpp"
 
 // podio specific includes
 #include "podio/EventStore.h"
+#include "podio/Frame.h"
 #include "podio/GenericParameters.h"
 #include "podio/ROOTFrameReader.h"
+#include "podio/ROOTFrameWriter.h"
 #include "podio/ROOTLegacyReader.h"
 #include "podio/ROOTReader.h"
 #include "podio/podioVersion.h"
@@ -26,6 +29,13 @@
   #include "podio/SIOReader.h"
 #endif
 
+#ifndef PODIO_ENABLE_RNTUPLE
+  #define PODIO_ENABLE_RNTUPLE 0
+#endif
+#if PODIO_ENABLE_RNTUPLE
+  #include "podio/ROOTNTupleWriter.h"
+#endif
+
 // Test data types
 #include "datamodel/EventInfoCollection.h"
 #include "datamodel/ExampleClusterCollection.h"
@@ -1134,3 +1144,57 @@ TEST_CASE("JSON", "[json]") {
 }
 
 #endif
+
+// Write a template function that can be used with different writers in order to
+// be able to tag the unittests differently. This is necessary because the
+// ROOTFrameWriter fails with ASan, but the ROOTNTuple writer doesn't
+template <typename WriterT>
+void runConsistentFrameTest(const std::string& filename) {
+  podio::Frame frame;
+
+  frame.put(ExampleClusterCollection(), "clusters");
+  frame.put(ExampleClusterCollection(), "clusters2");
+  frame.put(ExampleHitCollection(), "hits");
+
+  WriterT writer(filename);
+  writer.writeFrame(frame, "full");
+
+  // Write a frame with more collections
+  frame.put(ExampleHitCollection(), "hits2");
+  REQUIRE_THROWS_AS(writer.writeFrame(frame, "full"), std::runtime_error);
+
+  // Write a frame with less collections
+  podio::Frame frame2;
+  frame2.put(ExampleClusterCollection(), "clusters");
+  frame2.put(ExampleClusterCollection(), "clusters2");
+  REQUIRE_THROWS_AS(writer.writeFrame(frame2, "full"), std::runtime_error);
+
+  // Write only a subset of collections
+  const std::vector<std::string> collsToWrite = {"clusters", "hits"};
+  writer.writeFrame(frame, "subset", collsToWrite);
+
+  // Frame is missing a collection
+  REQUIRE_THROWS_AS(writer.writeFrame(frame2, "subset", collsToWrite), std::runtime_error);
+
+  // Don't throw if frame contents are different, but the subset that is written
+  // is consistent
+  const std::vector<std::string> otherCollsToWrite = {"clusters", "clusters2"};
+  writer.writeFrame(frame, "subset2", otherCollsToWrite);
+  REQUIRE_NOTHROW(writer.writeFrame(frame2, "subset2", otherCollsToWrite));
+
+  // Make sure that restricting the second frame works.
+  // See https://github.com/AIDASoft/podio/issues/382 for the original issue
+  writer.writeFrame(frame2, "full_frame2");
+  REQUIRE_NOTHROW(writer.writeFrame(frame, "full_frame2", frame2.getAvailableCollections()));
+}
+
+TEST_CASE("ROOTFrameWriter consistent frame contents", "[ASAN-FAIL][UBSAN-FAIL][basics][root]") {
+  // The UBSAN-FAIL only happens on clang12 in CI.
+  runConsistentFrameTest<podio::ROOTFrameWriter>("unittests_frame_consistency.root");
+}
+
+#if PODIO_ENABLE_RNTUPLE
+TEST_CASE("ROOTNTupleWriter consistent frame contents", "[basics][root]") {
+  runConsistentFrameTest<podio::ROOTNTupleWriter>("unittests_frame_consistency.root");
+}
+#endif