From 7df9f8457bac59f569efdf0d3cdb8f7b3ab534ae Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:57:18 +0200 Subject: [PATCH] hictk load: support ingesting interactions from pairs/pixel files with interactions overlapping with the lower-triangular matrix (#264) * Update hictk load to support mirroring pixels overlapping with the lower-triangular matrix * Add integration test cases covering the new flags * Fix typo --- cmake/FetchTestDataset.cmake | 4 ++-- src/hictk/cli/cli_load.cpp | 16 ++++++++++++++++ src/hictk/include/hictk/tools/config.hpp | 1 + src/hictk/load/load.cpp | 9 ++++++--- src/hictk/load/load.hpp | 20 ++++++++++++++++---- test/integration/config.toml | 3 +++ 6 files changed, 44 insertions(+), 9 deletions(-) diff --git a/cmake/FetchTestDataset.cmake b/cmake/FetchTestDataset.cmake index 188fb2a2..5a3f29a7 100644 --- a/cmake/FetchTestDataset.cmake +++ b/cmake/FetchTestDataset.cmake @@ -4,8 +4,8 @@ # cmake-format: off file( - DOWNLOAD https://zenodo.org/records/13849053/files/hictk_test_data.tar.zst?download=1 - EXPECTED_HASH SHA256=4a5a871421a981bc11fb4e4c6b9877e281dd721ef56fc35ff8cb940e81e301a0 + DOWNLOAD https://zenodo.org/records/13851354/files/hictk_test_data.tar.zst?download=1 + EXPECTED_HASH SHA256=55071172638948112a69a43ebfd07b0b9b830cc5e1bfef87b05b586d228ab1bd "${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.zst" ) # cmake-format: on diff --git a/src/hictk/cli/cli_load.cpp b/src/hictk/cli/cli_load.cpp index ff09c406..3a807795 100644 --- a/src/hictk/cli/cli_load.cpp +++ b/src/hictk/cli/cli_load.cpp @@ -136,6 +136,22 @@ void Cli::make_load_subcommand() { "Assume input files are already sorted.") ->capture_default_str(); + sc.add_flag( + "--validate-pixels,!--no-validate-pixels", + c.validate_pixels, + "Toggle pixel validation on or off.\n" + "When --no-validate-pixels is used and invalid pixels are encountered,\n" + "hictk will either crash or produce invalid files.") + ->capture_default_str(); + + sc.add_flag( + "--transpose-lower-triangular-pixels,!--no-transpose-lower-triangular-pixels", + c.transpose_lower_triangular_pixels, + "Transpose pixels overlapping the lower-triangular matrix.\n" + "When --no-transpose-lower-triangular-pixels is used and one or more pixels overlapping\n" + "with the lower triangular matrix are encountered an exception will be raised.") + ->capture_default_str(); + sc.add_option( "--chunk-size", c.batch_size, diff --git a/src/hictk/include/hictk/tools/config.hpp b/src/hictk/include/hictk/tools/config.hpp index 9a46a013..7b966870 100644 --- a/src/hictk/include/hictk/tools/config.hpp +++ b/src/hictk/include/hictk/tools/config.hpp @@ -157,6 +157,7 @@ struct LoadConfig { bool assume_sorted{false}; bool force{false}; bool validate_pixels{true}; + bool transpose_lower_triangular_pixels{false}; bool skip_all_vs_all_matrix{true}; std::string output_format{"auto"}; diff --git a/src/hictk/load/load.cpp b/src/hictk/load/load.cpp index 00406fa1..b5bc4df8 100644 --- a/src/hictk/load/load.cpp +++ b/src/hictk/load/load.cpp @@ -42,7 +42,8 @@ namespace hictk::tools { const auto queue_capacity = queue_capacity_bytes / sizeof(ThinPixel); PixelQueue pixel_queue{queue_capacity}; - auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return); + auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return, + c.transpose_lower_triangular_pixels); auto consumer = spawn_consumer(tpool, c, bins, parser.assembly(), format, pixel_queue, early_return); @@ -73,7 +74,8 @@ namespace hictk::tools { const auto& bins = parser.bins(); - auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return); + auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return, + c.transpose_lower_triangular_pixels); auto consumer = spawn_consumer(tpool, c, bins, parser.assembly(), format, pixel_queue, early_return); @@ -104,7 +106,8 @@ namespace hictk::tools { const auto& bins = parser.bins(); - auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return); + auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return, + c.transpose_lower_triangular_pixels); auto consumer = spawn_consumer(tpool, c, bins, parser.assembly(), format, pixel_queue, early_return); diff --git a/src/hictk/load/load.hpp b/src/hictk/load/load.hpp index a10bc2dc..cf1ae28b 100644 --- a/src/hictk/load/load.hpp +++ b/src/hictk/load/load.hpp @@ -37,7 +37,7 @@ namespace hictk::tools { const std::filesystem::path& path_to_bins, std::uint32_t resolution, std::string_view assembly); -template +template inline void parse_pixels(PixelParser& parser, std::int64_t offset, PixelQueue& queue, std::atomic& early_return) { ThinPixel buffer{}; @@ -45,6 +45,13 @@ inline void parse_pixels(PixelParser& parser, std::int64_t offset, PixelQueue assert(buffer.bin1_id != ThinPixel::null_id); assert(buffer.bin2_id != ThinPixel::null_id); assert(buffer.count != 0); + + if constexpr (transpose_pixels) { + if (buffer.bin1_id > buffer.bin2_id) { + std::swap(buffer.bin1_id, buffer.bin2_id); + } + } + while (!queue.try_enqueue(buffer) && !early_return) { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } @@ -133,10 +140,15 @@ template template [[nodiscard]] inline std::future spawn_producer(BS::thread_pool& tpool, PixelParser& parser, PixelQueue& queue, std::int64_t offset, - std::atomic& early_return) { - return tpool.submit_task([&parser, &queue, offset, &early_return]() { + std::atomic& early_return, + bool transpose_lower_triangular_pixels) { + return tpool.submit_task([&parser, &queue, offset, &early_return, + transpose_lower_triangular_pixels]() { try { - return parse_pixels(parser, offset, queue, early_return); + if (transpose_lower_triangular_pixels) { + return parse_pixels(parser, offset, queue, early_return); + } + return parse_pixels(parser, offset, queue, early_return); } catch (...) { SPDLOG_WARN( FMT_STRING("exception caught in thread parsing interactions: returning immediately!")); diff --git a/test/integration/config.toml b/test/integration/config.toml index 863e7637..73fe0b26 100644 --- a/test/integration/config.toml +++ b/test/integration/config.toml @@ -132,6 +132,7 @@ files = [ { input-path = "integration_tests/4DNFIKNWM36K.subset.validpairs.xz", format = "validpairs" }, { input-path = "integration_tests/dm6.bins.bed", format = "bed" }, { input-path = "integration_tests/dm6.chrom.sizes", format = "chrom.sizes" }, + { input-path = "integration_tests/dm6.chrom.sizes.sorted", format = "chrom.sizes" }, { input-path = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", format = "cool" }, { input-path = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.float.cool", format = "cool" }, { input-path = "integration_tests/4DNFIKNWM36K.subset.variable-bins.cool", format = "cool", variable-bin-size = true } @@ -149,6 +150,8 @@ test-cases = [ { input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.coo.xz", format = "coo", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", timeout = 240, args = { format = "coo", bin-size = 10000, assume-sorted = "" } }, { input-path = "integration_tests/4DNFIKNWM36K.subset.unsorted.bg2.xz", format = "bg2", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", timeout = 240, args = { format = "bg2", bin-size = 10000, assume-unsorted = "" } }, { input-path = "integration_tests/4DNFIKNWM36K.subset.unsorted.coo.xz", format = "coo", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", timeout = 240, args = { format = "coo", bin-size = 10000, assume-unsorted = "" } }, + { input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.bg2.xz", format = "bg2", chrom-sizes-path = "integration_tests/dm6.chrom.sizes.sorted", expect-failure = true, timeout = 240, args = { format = "bg2", bin-size = 10000, assume-unsorted = "" } }, + { input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.bg2.xz", format = "bg2", chrom-sizes-path = "integration_tests/dm6.chrom.sizes.sorted", no-validate = true, timeout = 240, args = { format = "bg2", bin-size = 10000, assume-unsorted = "", transpose-lower-triangular-pixels = "" } }, { input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.float.coo.xz", format = "coo", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.float.cool", timeout = 240, args = { format = "coo", bin-size = 10000, assume-sorted = "", count-as-float = "" } } ]