AIDASoft · tmadlener · Nov 14, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/include/podio/ROOTFrameWriter.h b/include/podio/ROOTFrameWriter.h
@@ -49,6 +49,22 @@ class ROOTFrameWriter {
    */
   void finish();
 
+  /** Check whether the collsToWrite are consistent with the state of the passed
+   * category.
+   *
+   * Return two vectors of collection names: the first holds the names present
+   * in the category but missing from collsToWrite, the second the names present
+   * in collsToWrite only. Both are empty if the category and collsToWrite are
+   * consistent. For a category not yet known to the writer the first vector is
+   * empty and the second one contains all of collsToWrite.
+   *
+   * NOTE: This will only be a meaningful check if the first Frame of the passed
+   * category has already been written. Also, this check is rather expensive as
+   * it has to effectively do two set differences.
+   */
+  std::tuple<std::vector<std::string>, std::vector<std::string>>
+  checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const;
+
 private:
   using StoreCollection = std::pair<const std::string&, podio::CollectionBase*>;
 

diff --git a/include/podio/ROOTNTupleWriter.h b/include/podio/ROOTNTupleWriter.h
@@ -32,29 +32,44 @@ class ROOTNTupleWriter {
   void writeFrame(const podio::Frame& frame, const std::string& category, const std::vector<std::string>& collsToWrite);
   void finish();
 
+  /** Check whether the collsToWrite are consistent with the state of the passed
+   * category.
+   *
+   * Return two vectors of collection names: the first holds the names present
+   * in the category but missing from collsToWrite, the second the names present
+   * in collsToWrite only. Both are empty if the category and collsToWrite are
+   * consistent. For a category not yet known to the writer the first vector is
+   * empty and the second one contains all of collsToWrite.
+   *
+   * NOTE: This will only be a meaningful check if the first Frame of the passed
+   * category has already been written. Also, this check is rather expensive as
+   * it has to effectively do two set differences.
+   */
+  std::tuple<std::vector<std::string>, std::vector<std::string>>
+  checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const;
+
 private:
   using StoreCollection = std::pair<const std::string&, podio::CollectionBase*>;
   std::unique_ptr<ROOT::Experimental::RNTupleModel> createModels(const std::vector<StoreCollection>& collections);
 
   std::unique_ptr<ROOT::Experimental::RNTupleModel> m_metadata{};
-  std::unordered_map<std::string, std::unique_ptr<ROOT::Experimental::RNTupleWriter>> m_writers{};
   std::unique_ptr<ROOT::Experimental::RNTupleWriter> m_metadataWriter{};
 
   std::unique_ptr<TFile> m_file{};
 
   DatamodelDefinitionCollector m_datamodelCollector{};
 
   struct CollectionInfo {
-    std::vector<unsigned int> id{};
+    std::vector<uint32_t> id{}; // filled from coll->getID() when the first Frame of the category is written
     std::vector<std::string> name{};
     std::vector<std::string> type{};
     std::vector<short> isSubsetCollection{};
     std::vector<SchemaVersionT> schemaVersion{};
+    std::unique_ptr<ROOT::Experimental::RNTupleWriter> writer{nullptr}; // nullptr until the first Frame is written
   };
+  CollectionInfo& getCategoryInfo(const std::string& category);
 
-  std::unordered_map<std::string, CollectionInfo> m_collectionInfo{};
-
-  std::set<std::string> m_categories{};
+  std::unordered_map<std::string, CollectionInfo> m_categories{};
 
   bool m_finished{false};
 

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -109,6 +109,9 @@ PODIO_ADD_LIB_AND_DICT(podioRootIO "${root_headers}" "${root_sources}" root_sele
 target_link_libraries(podioRootIO PUBLIC podio::podio ROOT::Core ROOT::RIO ROOT::Tree)
 if(ENABLE_RNTUPLE)
   target_link_libraries(podioRootIO PUBLIC ROOT::ROOTNTuple)
+  target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=1) # PUBLIC: also seen by targets linking podioRootIO
+else()
+  target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=0)
 endif()
 
 

diff --git a/src/ROOTFrameWriter.cc b/src/ROOTFrameWriter.cc
@@ -41,9 +41,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
   collections.reserve(catInfo.collsToWrite.size());
   for (const auto& name : catInfo.collsToWrite) {
     auto* coll = frame.getCollectionForWrite(name);
+    if (!coll) {
+      // Make sure all collections that we want to write are actually available
+      // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
+      throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
+    }
     collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));
-
-    m_datamodelCollector.registerDatamodelDefinition(coll, name);
   }
 
   // We will at least have a parameters branch, even if there are no
@@ -52,6 +55,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
     initBranches(catInfo, collections, const_cast<podio::GenericParameters&>(frame.getParameters()));
 
   } else {
+    // Make sure that the category contents are consistent with the initial
+    // frame in the category
+    if (!root_utils::checkConsistentColls(catInfo.collsToWrite, collsToWrite)) {
+      throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
+                               root_utils::getInconsistentCollsMsg(catInfo.collsToWrite, collsToWrite));
+    }
     resetBranches(catInfo.branches, collections, &const_cast<podio::GenericParameters&>(frame.getParameters()));
   }
 
@@ -73,6 +82,10 @@ void ROOTFrameWriter::initBranches(CategoryInfo& catInfo, const std::vector<Stor
 
   // First collections
   for (auto& [name, coll] : collections) {
+    // For the first entry in each category we also record the datamodel
+    // definition
+    m_datamodelCollector.registerDatamodelDefinition(coll, name);
+
     root_utils::CollectionBranches branches;
     const auto buffers = coll->getBuffers();
     // For subset collections we only fill one references branch
@@ -153,4 +166,13 @@ void ROOTFrameWriter::finish() {
   m_finished = true;
 }
 
+std::tuple<std::vector<std::string>, std::vector<std::string>>
+ROOTFrameWriter::checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const {
+  const auto catIt = m_categories.find(category);
+  if (catIt == m_categories.end()) {
+    return {std::vector<std::string>{}, collsToWrite}; // unknown category: nothing missing, all names are new
+  }
+  return root_utils::getInconsistentColls(catIt->second.collsToWrite, collsToWrite);
+}
+
 } // namespace podio
diff --git a/src/ROOTNTupleWriter.cc b/src/ROOTNTupleWriter.cc
@@ -64,22 +64,50 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
 
 void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& category,
                                   const std::vector<std::string>& collsToWrite) {
+  auto& catInfo = getCategoryInfo(category);
+
+  // Use the writer as proxy to check whether this category has been initialized
+  // already and do so if not
+  const bool new_category = (catInfo.writer == nullptr);
+  if (new_category) {
+    // This is the minimal information that we need for now
+    catInfo.name = collsToWrite;
+  }
 
   std::vector<StoreCollection> collections;
-  collections.reserve(collsToWrite.size());
-  for (const auto& name : collsToWrite) {
+  collections.reserve(catInfo.name.size());
+  // Only loop over the collections that were requested in the first Frame of
+  // this category
+  for (const auto& name : catInfo.name) {
     auto* coll = frame.getCollectionForWrite(name);
+    if (!coll) {
+      // Make sure all collections that we want to write are actually available
+      // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
+      throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
+    }
+
     collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));
   }
 
-  bool new_category = false;
-  if (m_writers.find(category) == m_writers.end()) {
-    new_category = true;
+  if (new_category) {
+    // Now we have enough info to populate the rest
     auto model = createModels(collections);
-    m_writers[category] = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
+    catInfo.writer = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
+
+    for (const auto& [name, coll] : collections) {
+      catInfo.id.emplace_back(coll->getID());
+      catInfo.type.emplace_back(coll->getTypeName());
+      catInfo.isSubsetCollection.emplace_back(coll->isSubsetCollection());
+      catInfo.schemaVersion.emplace_back(coll->getSchemaVersion());
+    }
+  } else {
+    if (!root_utils::checkConsistentColls(catInfo.name, collsToWrite)) {
+      throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
+                               root_utils::getInconsistentCollsMsg(catInfo.name, collsToWrite));
+    }
   }
 
-  auto entry = m_writers[category]->GetModel()->CreateBareEntry();
+  auto entry = m_categories[category].writer->GetModel()->CreateBareEntry();
 
   ROOT::Experimental::RNTupleWriteOptions options;
   options.SetCompression(ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose);
@@ -121,14 +149,6 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
     // Not supported
     // entry->CaptureValueUnsafe(root_utils::paramBranchName,
     // &const_cast<podio::GenericParameters&>(frame.getParameters()));
-
-    if (new_category) {
-      m_collectionInfo[category].id.emplace_back(coll->getID());
-      m_collectionInfo[category].name.emplace_back(name);
-      m_collectionInfo[category].type.emplace_back(coll->getTypeName());
-      m_collectionInfo[category].isSubsetCollection.emplace_back(coll->isSubsetCollection());
-      m_collectionInfo[category].schemaVersion.emplace_back(coll->getSchemaVersion());
-    }
   }
 
   auto params = frame.getParameters();
@@ -137,8 +157,7 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
   fillParams<double>(params, entry.get());
   fillParams<std::string>(params, entry.get());
 
-  m_writers[category]->Fill(*entry);
-  m_categories.insert(category);
+  m_categories[category].writer->Fill(*entry);
 }
 
 std::unique_ptr<ROOT::Experimental::RNTupleModel>
@@ -215,6 +234,15 @@ ROOTNTupleWriter::createModels(const std::vector<StoreCollection>& collections)
   return model;
 }
 
+ROOTNTupleWriter::CollectionInfo& ROOTNTupleWriter::getCategoryInfo(const std::string& category) {
+  // try_emplace leaves an already known category untouched and hands back its
+  // entry; for a new category a default constructed CollectionInfo is created
+  // in place. Either way the returned iterator points at the category's entry.
+  const auto insertion = m_categories.try_emplace(category);
+  auto& categoryInfo = insertion.first->second;
+  return categoryInfo;
+}
+
 void ROOTNTupleWriter::finish() {
 
   auto podioVersion = podio::version::build_version;
@@ -227,21 +255,21 @@ void ROOTNTupleWriter::finish() {
   *edmField = edmDefinitions;
 
   auto availableCategoriesField = m_metadata->MakeField<std::vector<std::string>>(root_utils::availableCategories);
-  for (auto& [c, _] : m_collectionInfo) {
+  for (auto& [c, _] : m_categories) {
     availableCategoriesField->push_back(c);
   }
 
-  for (auto& category : m_categories) {
+  for (auto& [category, collInfo] : m_categories) {
     auto idField = m_metadata->MakeField<std::vector<unsigned int>>({root_utils::idTableName(category)});
-    *idField = m_collectionInfo[category].id;
+    *idField = collInfo.id;
     auto collectionNameField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collectionName(category)});
-    *collectionNameField = m_collectionInfo[category].name;
+    *collectionNameField = collInfo.name;
     auto collectionTypeField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collInfoName(category)});
-    *collectionTypeField = m_collectionInfo[category].type;
+    *collectionTypeField = collInfo.type;
     auto subsetCollectionField = m_metadata->MakeField<std::vector<short>>({root_utils::subsetCollection(category)});
-    *subsetCollectionField = m_collectionInfo[category].isSubsetCollection;
+    *subsetCollectionField = collInfo.isSubsetCollection;
     auto schemaVersionField = m_metadata->MakeField<std::vector<SchemaVersionT>>({"schemaVersion_" + category});
-    *schemaVersionField = m_collectionInfo[category].schemaVersion;
+    *schemaVersionField = collInfo.schemaVersion;
   }
 
   m_metadata->Freeze();
@@ -254,10 +282,21 @@ void ROOTNTupleWriter::finish() {
 
   // All the tuple writers must be deleted before the file so that they flush
   // unwritten output
-  m_writers.clear();
+  for (auto& [_, catInfo] : m_categories) {
+    catInfo.writer.reset();
+  }
   m_metadataWriter.reset();
 
   m_finished = true;
 }
 
+std::tuple<std::vector<std::string>, std::vector<std::string>>
+ROOTNTupleWriter::checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const {
+  const auto catIt = m_categories.find(category);
+  if (catIt == m_categories.end()) {
+    return {std::vector<std::string>{}, collsToWrite}; // unknown category: nothing missing, all names are new
+  }
+  return root_utils::getInconsistentColls(catIt->second.name, collsToWrite);
+}
+
 } // namespace podio
diff --git a/src/rootUtils.h b/src/rootUtils.h
@@ -14,6 +14,7 @@
 #include <algorithm>
 #include <cctype>
 #include <iostream>
+#include <iterator>
 #include <string>
 #include <string_view>
 #include <tuple>
@@ -272,6 +273,75 @@ inline std::vector<std::string> sortAlphabeticaly(std::vector<std::string> strin
   return strings;
 }
 
+/**
+ * Check whether existingColls and candidateColls both contain the same
+ * collection names. Returns false if the two vectors differ in content. Inputs
+ * can have random order wrt each other, but the assumption is that each vector
+ * only contains unique names.
+ */
+inline bool checkConsistentColls(const std::vector<std::string>& existingColls,
+                                 const std::vector<std::string>& candidateColls) {
+  if (existingColls.size() != candidateColls.size()) {
+    return false;
+  }
+
+  // Names are unique and sizes match, so it suffices that every candidate is
+  // found in existingColls. The inputs are not sorted, hence an exact-match
+  // linear search (fast enough for the few hundred names typical here)
+  for (const auto& id : candidateColls) {
+    if (std::find(existingColls.begin(), existingColls.end(), id) == existingColls.end()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Compute the differences between two lists of collection names. The first
+ * element of the returned tuple holds the names that are only present in
+ * existingColls, the second element holds the names that are only present in
+ * candidateColls. Both inputs are taken by value as they are sorted internally.
+ */
+inline std::tuple<std::vector<std::string>, std::vector<std::string>>
+getInconsistentColls(std::vector<std::string> existingColls, std::vector<std::string> candidateColls) {
+  // std::set_difference requires both of its input ranges to be sorted
+  std::sort(candidateColls.begin(), candidateColls.end());
+  std::sort(existingColls.begin(), existingColls.end());
+
+  // Collect every name that appears in lhs but not in rhs
+  const auto namesOnlyIn = [](const std::vector<std::string>& lhs, const std::vector<std::string>& rhs) {
+    std::vector<std::string> diff{};
+    std::set_difference(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), std::back_inserter(diff));
+    return diff;
+  };
+
+  auto missing = namesOnlyIn(existingColls, candidateColls);
+  return {std::move(missing), namesOnlyIn(candidateColls, existingColls)};
+}
+
+inline std::string getInconsistentCollsMsg(const std::vector<std::string>& existingColls,
+                                           const std::vector<std::string>& candidateColls) {
+  // Names only in existingColls are reported as missing, names only in
+  // candidateColls as superfluous
+  const auto diffs = getInconsistentColls(existingColls, candidateColls);
+
+  std::stringstream msg;
+  // Append the names as a comma separated list (no separator before the first)
+  const auto appendNames = [&msg](const std::vector<std::string>& names) {
+    for (auto it = names.begin(); it != names.end(); ++it) {
+      msg << (it == names.begin() ? "" : ",") << *it;
+    }
+  };
+
+  msg << "missing: [";
+  appendNames(std::get<0>(diffs));
+  msg << "], superfluous: [";
+  appendNames(std::get<1>(diffs));
+  msg << "]";
+  return msg.str();
+}
+
 } // namespace podio::root_utils
 
 #endif
diff --git a/tests/unittests/CMakeLists.txt b/tests/unittests/CMakeLists.txt
@@ -88,7 +88,7 @@ else()
       WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
       TEST_PREFIX "UT_" # make it possible to filter easily with -R ^UT
       TEST_SPEC ${filter_tests} # discover only tests that are known to not fail
-      DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
+      DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:${PROJECT_BINARY_DIR}/tests:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
       PROPERTIES
         ENVIRONMENT
         PODIO_SIOBLOCK_PATH=${CMAKE_CURRENT_BINARY_DIR}