Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make sure that the ROOT writers enforce consistency for the Frame contents they write #513

Merged
merged 19 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions include/podio/ROOTFrameWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,22 @@ class ROOTFrameWriter {
*/
void finish();

/** Check whether the collsToWrite are consistent with the state of the passed
* category.
*
* Return two vectors of collection names. The first one contains all the
* names that were missing from the collsToWrite but were present in the
* category. The second one contains the names that are present in the
* collsToWrite only. If both vectors are empty the category and the passed
* collsToWrite are consistent.
*
* NOTE: This will only be a meaningful check if the first Frame of the passed
* category has already been written. Also, this check is rather expensive as
* it has to effectively do two set differences.
*/
std::tuple<std::vector<std::string>, std::vector<std::string>>
checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const;

private:
using StoreCollection = std::pair<const std::string&, podio::CollectionBase*>;

Expand Down
25 changes: 20 additions & 5 deletions include/podio/ROOTNTupleWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,44 @@ class ROOTNTupleWriter {
void writeFrame(const podio::Frame& frame, const std::string& category, const std::vector<std::string>& collsToWrite);
void finish();

/** Check whether the collsToWrite are consistent with the state of the passed
* category.
*
* Return two vectors of collection names. The first one contains all the
* names that were missing from the collsToWrite but were present in the
* category. The second one contains the names that are present in the
* collsToWrite only. If both vectors are empty the category and the passed
* collsToWrite are consistent.
*
* NOTE: This will only be a meaningful check if the first Frame of the passed
* category has already been written. Also, this check is rather expensive as
* it has to effectively do two set differences.
*/
std::tuple<std::vector<std::string>, std::vector<std::string>>
checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const;

private:
using StoreCollection = std::pair<const std::string&, podio::CollectionBase*>;
std::unique_ptr<ROOT::Experimental::RNTupleModel> createModels(const std::vector<StoreCollection>& collections);

std::unique_ptr<ROOT::Experimental::RNTupleModel> m_metadata{};
std::unordered_map<std::string, std::unique_ptr<ROOT::Experimental::RNTupleWriter>> m_writers{};
std::unique_ptr<ROOT::Experimental::RNTupleWriter> m_metadataWriter{};

std::unique_ptr<TFile> m_file{};

DatamodelDefinitionCollector m_datamodelCollector{};

struct CollectionInfo {
std::vector<unsigned int> id{};
std::vector<uint32_t> id{};
std::vector<std::string> name{};
std::vector<std::string> type{};
std::vector<short> isSubsetCollection{};
std::vector<SchemaVersionT> schemaVersion{};
std::unique_ptr<ROOT::Experimental::RNTupleWriter> writer{nullptr};
};
CollectionInfo& getCategoryInfo(const std::string& category);

std::unordered_map<std::string, CollectionInfo> m_collectionInfo{};

std::set<std::string> m_categories{};
std::unordered_map<std::string, CollectionInfo> m_categories{};

bool m_finished{false};

Expand Down
3 changes: 3 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ PODIO_ADD_LIB_AND_DICT(podioRootIO "${root_headers}" "${root_sources}" root_sele
target_link_libraries(podioRootIO PUBLIC podio::podio ROOT::Core ROOT::RIO ROOT::Tree)
if(ENABLE_RNTUPLE)
target_link_libraries(podioRootIO PUBLIC ROOT::ROOTNTuple)
target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=1)
else()
target_compile_definitions(podioRootIO PUBLIC PODIO_ENABLE_RNTUPLE=0)
endif()


Expand Down
26 changes: 24 additions & 2 deletions src/ROOTFrameWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
collections.reserve(catInfo.collsToWrite.size());
for (const auto& name : catInfo.collsToWrite) {
auto* coll = frame.getCollectionForWrite(name);
if (!coll) {
// Make sure all collections that we want to write are actually available
// NOLINTNEXTLINE(performance-inefficient-string-concatenation)
throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
}
collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));

m_datamodelCollector.registerDatamodelDefinition(coll, name);
}

// We will at least have a parameters branch, even if there are no
Expand All @@ -52,6 +55,12 @@ void ROOTFrameWriter::writeFrame(const podio::Frame& frame, const std::string& c
initBranches(catInfo, collections, const_cast<podio::GenericParameters&>(frame.getParameters()));

} else {
// Make sure that the category contents are consistent with the initial
// frame in the category
if (!root_utils::checkConsistentColls(catInfo.collsToWrite, collsToWrite)) {
throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
root_utils::getInconsistentCollsMsg(catInfo.collsToWrite, collsToWrite));
}
resetBranches(catInfo.branches, collections, &const_cast<podio::GenericParameters&>(frame.getParameters()));
}

Expand All @@ -73,6 +82,10 @@ void ROOTFrameWriter::initBranches(CategoryInfo& catInfo, const std::vector<Stor

// First collections
for (auto& [name, coll] : collections) {
// For the first entry in each category we also record the datamodel
// definition
m_datamodelCollector.registerDatamodelDefinition(coll, name);

root_utils::CollectionBranches branches;
const auto buffers = coll->getBuffers();
// For subset collections we only fill one references branch
Expand Down Expand Up @@ -153,4 +166,13 @@ void ROOTFrameWriter::finish() {
m_finished = true;
}

std::tuple<std::vector<std::string>, std::vector<std::string>>
ROOTFrameWriter::checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const {
if (const auto it = m_categories.find(category); it != m_categories.end()) {
return root_utils::getInconsistentColls(it->second.collsToWrite, collsToWrite);
}

return {std::vector<std::string>{}, collsToWrite};
}

} // namespace podio
89 changes: 64 additions & 25 deletions src/ROOTNTupleWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,50 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&

void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string& category,
const std::vector<std::string>& collsToWrite) {
auto& catInfo = getCategoryInfo(category);

// Use the writer as proxy to check whether this category has been initialized
// already and do so if not
const bool new_category = (catInfo.writer == nullptr);
if (new_category) {
// This is the minimal information that we need for now
catInfo.name = collsToWrite;
}

std::vector<StoreCollection> collections;
collections.reserve(collsToWrite.size());
for (const auto& name : collsToWrite) {
collections.reserve(catInfo.name.size());
// Only loop over the collections that were requested in the first Frame of
// this category
for (const auto& name : catInfo.name) {
auto* coll = frame.getCollectionForWrite(name);
if (!coll) {
// Make sure all collections that we want to write are actually available
// NOLINTNEXTLINE(performance-inefficient-string-concatenation)
throw std::runtime_error("Collection '" + name + "' in category '" + category + "' is not available in Frame");
}

collections.emplace_back(name, const_cast<podio::CollectionBase*>(coll));
}

bool new_category = false;
if (m_writers.find(category) == m_writers.end()) {
new_category = true;
if (new_category) {
// Now we have enough info to populate the rest
auto model = createModels(collections);
m_writers[category] = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});
catInfo.writer = ROOT::Experimental::RNTupleWriter::Append(std::move(model), category, *m_file.get(), {});

for (const auto& [name, coll] : collections) {
catInfo.id.emplace_back(coll->getID());
catInfo.type.emplace_back(coll->getTypeName());
catInfo.isSubsetCollection.emplace_back(coll->isSubsetCollection());
catInfo.schemaVersion.emplace_back(coll->getSchemaVersion());
}
} else {
if (!root_utils::checkConsistentColls(catInfo.name, collsToWrite)) {
throw std::runtime_error("Trying to write category '" + category + "' with inconsistent collection content. " +
root_utils::getInconsistentCollsMsg(catInfo.name, collsToWrite));
}
}

auto entry = m_writers[category]->GetModel()->CreateBareEntry();
auto entry = m_categories[category].writer->GetModel()->CreateBareEntry();

ROOT::Experimental::RNTupleWriteOptions options;
options.SetCompression(ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose);
Expand Down Expand Up @@ -121,14 +149,6 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
// Not supported
// entry->CaptureValueUnsafe(root_utils::paramBranchName,
// &const_cast<podio::GenericParameters&>(frame.getParameters()));

if (new_category) {
m_collectionInfo[category].id.emplace_back(coll->getID());
m_collectionInfo[category].name.emplace_back(name);
m_collectionInfo[category].type.emplace_back(coll->getTypeName());
m_collectionInfo[category].isSubsetCollection.emplace_back(coll->isSubsetCollection());
m_collectionInfo[category].schemaVersion.emplace_back(coll->getSchemaVersion());
}
}

auto params = frame.getParameters();
Expand All @@ -137,8 +157,7 @@ void ROOTNTupleWriter::writeFrame(const podio::Frame& frame, const std::string&
fillParams<double>(params, entry.get());
fillParams<std::string>(params, entry.get());

m_writers[category]->Fill(*entry);
m_categories.insert(category);
m_categories[category].writer->Fill(*entry);
}

std::unique_ptr<ROOT::Experimental::RNTupleModel>
Expand Down Expand Up @@ -215,6 +234,15 @@ ROOTNTupleWriter::createModels(const std::vector<StoreCollection>& collections)
return model;
}

ROOTNTupleWriter::CollectionInfo& ROOTNTupleWriter::getCategoryInfo(const std::string& category) {
if (auto it = m_categories.find(category); it != m_categories.end()) {
return it->second;
}

auto [it, _] = m_categories.try_emplace(category, CollectionInfo{});
return it->second;
}

void ROOTNTupleWriter::finish() {

auto podioVersion = podio::version::build_version;
Expand All @@ -227,21 +255,21 @@ void ROOTNTupleWriter::finish() {
*edmField = edmDefinitions;

auto availableCategoriesField = m_metadata->MakeField<std::vector<std::string>>(root_utils::availableCategories);
for (auto& [c, _] : m_collectionInfo) {
for (auto& [c, _] : m_categories) {
availableCategoriesField->push_back(c);
}

for (auto& category : m_categories) {
for (auto& [category, collInfo] : m_categories) {
auto idField = m_metadata->MakeField<std::vector<unsigned int>>({root_utils::idTableName(category)});
*idField = m_collectionInfo[category].id;
*idField = collInfo.id;
auto collectionNameField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collectionName(category)});
*collectionNameField = m_collectionInfo[category].name;
*collectionNameField = collInfo.name;
auto collectionTypeField = m_metadata->MakeField<std::vector<std::string>>({root_utils::collInfoName(category)});
*collectionTypeField = m_collectionInfo[category].type;
*collectionTypeField = collInfo.type;
auto subsetCollectionField = m_metadata->MakeField<std::vector<short>>({root_utils::subsetCollection(category)});
*subsetCollectionField = m_collectionInfo[category].isSubsetCollection;
*subsetCollectionField = collInfo.isSubsetCollection;
auto schemaVersionField = m_metadata->MakeField<std::vector<SchemaVersionT>>({"schemaVersion_" + category});
*schemaVersionField = m_collectionInfo[category].schemaVersion;
*schemaVersionField = collInfo.schemaVersion;
}

m_metadata->Freeze();
Expand All @@ -254,10 +282,21 @@ void ROOTNTupleWriter::finish() {

// All the tuple writers must be deleted before the file so that they flush
// unwritten output
m_writers.clear();
for (auto& [_, catInfo] : m_categories) {
catInfo.writer.reset();
}
m_metadataWriter.reset();

m_finished = true;
}

std::tuple<std::vector<std::string>, std::vector<std::string>>
ROOTNTupleWriter::checkConsistency(const std::vector<std::string>& collsToWrite, const std::string& category) const {
if (const auto it = m_categories.find(category); it != m_categories.end()) {
return root_utils::getInconsistentColls(it->second.name, collsToWrite);
}

return {std::vector<std::string>{}, collsToWrite};
}

} // namespace podio
70 changes: 70 additions & 0 deletions src/rootUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <algorithm>
#include <cctype>
#include <iostream>
#include <iterator>
#include <string>
#include <string_view>
#include <tuple>
Expand Down Expand Up @@ -272,6 +273,75 @@ inline std::vector<std::string> sortAlphabeticaly(std::vector<std::string> strin
return strings;
}

/**
* Check whether existingColls and candidateColls both contain the same
* collection names. Returns false if the two vectors differ in content. Inputs
* can have random order wrt each other, but the assumption is that each vector
* only contains unique names.
*/
inline bool checkConsistentColls(const std::vector<std::string>& existingColls,
const std::vector<std::string>& candidateColls) {
andresailer marked this conversation as resolved.
Show resolved Hide resolved
if (existingColls.size() != candidateColls.size()) {
return false;
}

// Since we are guaranteed to have unique names here, we can just look for
// collisions brute force, which seems to be quickest approach for vector
// sizes we typically have here (few hundred)
for (const auto& id : candidateColls) {
if (std::find(existingColls.begin(), existingColls.end(), id) == existingColls.end()) {
return false;
}
}
Copy link
Member

@jmcarcell jmcarcell Nov 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Since we are guaranteed to have unique names here, we can just look for
// collisions brute force, which seems to be quickest approach for vector
// sizes we typically have here (few hundred)
for (const auto& id : candidateColls) {
if (std::find(existingColls.begin(), existingColls.end(), id) == existingColls.end()) {
return false;
}
}
// Since we are guaranteed to have unique names here, we can just look for
// collisions brute force, which seems to be quickest approach for vector
// sizes we typically have here (few hundred)
for (const auto& id : candidateColls) {
if (!std::binary_search(existingColls.begin(), existingColls.end(), id, [](const auto& lhs, const auto& rhs) { std::lexicographical_compare(
lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
[](const auto& cl, const auto& cr) { return std::tolower(cl) < std::tolower(cr); }))) {
return false;
}
}

We can use that existingColls is ordered, and maybe make a helper function for the comparison lambda, looks like it should be faster.

Edit: Actually they are not sorted for the rntuple so this wouldn't work then without sorting them first

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the binary_search version to benchmarks1 I did before to see if a more involved approach works better for us. It looks like a binary_search approach without lower-casing things is the best (depending on the number of collections). On the other hand, in most cases this check will probably exit simply because the two vectors have unequal size.

Results with one vector orderd as we do in the ROOTFrameWriter (and the second one unordered). The numbers are the sizes of the vectors. CompareLinear is the current implementation, CompareBinary and CompareBinaryLowerCase use binary_search. The former one uses binary_search(existingColls.begin(), existingColls.end(), id), the second one the proposal from above.

---------------------------------------------------------------------
Benchmark                           Time             CPU   Iterations
---------------------------------------------------------------------
CompareLinear/2                  11.8 ns         11.8 ns     57782514
CompareLinear/8                  50.6 ns         50.5 ns     10000000
CompareLinear/64                 1153 ns         1151 ns       603770
CompareLinear/512               61044 ns        60996 ns        11362
CompareBinary/2                  27.8 ns         27.8 ns     24459705
CompareBinary/8                   142 ns          141 ns      4901529
CompareBinary/64                  525 ns          525 ns      1375137
CompareBinary/512                 706 ns          705 ns       983974
CompareBinaryLowerCase/2          251 ns          251 ns      2779374
CompareBinaryLowerCase/8         1646 ns         1644 ns       422281
CompareBinaryLowerCase/64       19288 ns        19274 ns        36780
CompareBinaryLowerCase/512     173434 ns       173318 ns         4026

Footnotes

  1. Basically what the benchmark does is to select N collection names randomly from all the collection names that we collected during Make CollectionIDs a 32bit hash value of the collection name #412. This is done twice, in order to have vectors to compare. In this way the cheap check for the sizes is effectively always skipped and depending on N the actual loop is executed at least a few times. The biggest N=512 in this case represents the worst case for the runtime as it will do the full N^2 of work. This is not entirely our use case, but it should represent sort of the worst case scenario.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After some further checking it turns out that binary_search without taking into account that existingColls are lexicographically sorted case insensitve doesn't work because the pre-conditions of binary_search are broken. Hence, it looks like std::find and linear searching is still the way to go.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for documentation of our short internal discussion. Making the binary_search approach exit early in case the strings are of unequal sizes, speeds that up quite a bit and it becomes comparable in speed at 64 elements already and scales much better:

-----------------------------------------------------------------------------
Benchmark                                   Time             CPU   Iterations
-----------------------------------------------------------------------------
CompareLinear/2                          11.3 ns         11.3 ns     58053921
CompareLinear/8                          48.3 ns         48.3 ns     14333123
CompareLinear/64                         1130 ns         1130 ns       618685
CompareLinear/512                       60220 ns        60216 ns        11686
CompareBinaryLowerCaseShortCut/2          150 ns          150 ns      4636032
CompareBinaryLowerCaseShortCut/8          498 ns          498 ns      1373998
CompareBinaryLowerCaseShortCut/64        1159 ns         1159 ns       599214
CompareBinaryLowerCaseShortCut/512       6149 ns         6149 ns       111620

Given that our happy path will always have to go through the complete vector to see whether the contents are the same, I have switched to this approach and also made sure that the collection names are sorted accordingly for the RNTuple writer.


return true;
}

/**
* Get the differences in the existingColls and candidateColls collection names.
* Returns two vectors of collection names. The first one are the collections
* that only exist in the existingColls, the seconde one are the names that only
* exist in the candidateColls.
*/
inline std::tuple<std::vector<std::string>, std::vector<std::string>>
getInconsistentColls(std::vector<std::string> existingColls, std::vector<std::string> candidateColls) {
// Need sorted ranges for set_difference
std::sort(existingColls.begin(), existingColls.end());
std::sort(candidateColls.begin(), candidateColls.end());

std::vector<std::string> onlyInExisting{};
std::set_difference(existingColls.begin(), existingColls.end(), candidateColls.begin(), candidateColls.end(),
std::back_inserter(onlyInExisting));

std::vector<std::string> onlyInCands{};
std::set_difference(candidateColls.begin(), candidateColls.end(), existingColls.begin(), existingColls.end(),
std::back_inserter(onlyInCands));

return {std::move(onlyInExisting), std::move(onlyInCands)};
}

inline std::string getInconsistentCollsMsg(const std::vector<std::string>& existingColls,
const std::vector<std::string>& candidateColls) {
const auto& [onlyExisting, onlyCands] = getInconsistentColls(existingColls, candidateColls);

std::stringstream sstr;
sstr << "missing: [";
std::string sep = "";
for (const auto& name : onlyExisting) {
sstr << sep << name;
sep = ",";
}
sep = "";
sstr << "], superfluous: [";
for (const auto& name : onlyCands) {
sstr << sep << name;
sep = ",";
}
sstr << "]";
tmadlener marked this conversation as resolved.
Show resolved Hide resolved

return sstr.str();
}

} // namespace podio::root_utils

#endif
2 changes: 1 addition & 1 deletion tests/unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ else()
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
TEST_PREFIX "UT_" # make it possible to filter easily with -R ^UT
TEST_SPEC ${filter_tests} # discover only tests that are known to not fail
DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
DL_PATHS ${CMAKE_CURRENT_BINARY_DIR}:${PROJECT_BINARY_DIR}/src:${PROJECT_BINARY_DIR}/tests:$<TARGET_FILE_DIR:ROOT::Tree>:$<$<TARGET_EXISTS:SIO::sio>:$<TARGET_FILE_DIR:SIO::sio>>:$ENV{LD_LIBRARY_PATH}
PROPERTIES
ENVIRONMENT
PODIO_SIOBLOCK_PATH=${CMAKE_CURRENT_BINARY_DIR}
Expand Down
Loading