Skip to content

Commit

Permalink
hictk load: support ingesting interactions from pairs/pixel files with interactions overlapping with the lower-triangular matrix (#264)
Browse files Browse the repository at this point in the history

* Update hictk load to support mirroring pixels overlapping with the lower-triangular matrix

* Add integration test cases covering the new flags

* Fix typo
  • Loading branch information
robomics committed Sep 30, 2024
1 parent 08f37d5 commit 7df9f84
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 9 deletions.
4 changes: 2 additions & 2 deletions cmake/FetchTestDataset.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

# cmake-format: off
file(
DOWNLOAD https://zenodo.org/records/13849053/files/hictk_test_data.tar.zst?download=1
EXPECTED_HASH SHA256=4a5a871421a981bc11fb4e4c6b9877e281dd721ef56fc35ff8cb940e81e301a0
DOWNLOAD https://zenodo.org/records/13851354/files/hictk_test_data.tar.zst?download=1
EXPECTED_HASH SHA256=55071172638948112a69a43ebfd07b0b9b830cc5e1bfef87b05b586d228ab1bd
"${PROJECT_SOURCE_DIR}/test/data/hictk_test_data.tar.zst"
)
# cmake-format: on
Expand Down
16 changes: 16 additions & 0 deletions src/hictk/cli/cli_load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,22 @@ void Cli::make_load_subcommand() {
"Assume input files are already sorted.")
->capture_default_str();

// Register the paired --validate-pixels / --no-validate-pixels switch on the
// load subcommand; the parsed value is written to c.validate_pixels.
// NOTE(review): the "!" prefix on the second name is presumably the CLI
// library's negated-flag syntax (passing it stores false) - confirm.
// The help text warns that with validation off, invalid pixels lead to a
// crash or to invalid output files.
sc.add_flag(
"--validate-pixels,!--no-validate-pixels",
c.validate_pixels,
"Toggle pixel validation on or off.\n"
"When --no-validate-pixels is used and invalid pixels are encountered,\n"
"hictk will either crash or produce invalid files.")
->capture_default_str();

// Register the paired --transpose-lower-triangular-pixels /
// --no-transpose-lower-triangular-pixels switch on the load subcommand; the
// parsed value is written to c.transpose_lower_triangular_pixels.
// NOTE(review): the "!" prefix on the second name is presumably the CLI
// library's negated-flag syntax (passing it stores false) - confirm.
// Per the help text, when transposition is disabled a pixel overlapping the
// lower-triangular matrix causes an exception to be raised.
sc.add_flag(
"--transpose-lower-triangular-pixels,!--no-transpose-lower-triangular-pixels",
c.transpose_lower_triangular_pixels,
"Transpose pixels overlapping the lower-triangular matrix.\n"
"When --no-transpose-lower-triangular-pixels is used and one or more pixels overlapping\n"
"with the lower triangular matrix are encountered an exception will be raised.")
->capture_default_str();

sc.add_option(
"--chunk-size",
c.batch_size,
Expand Down
1 change: 1 addition & 0 deletions src/hictk/include/hictk/tools/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ struct LoadConfig {
bool assume_sorted{false};
bool force{false};
bool validate_pixels{true};
bool transpose_lower_triangular_pixels{false};
bool skip_all_vs_all_matrix{true};

std::string output_format{"auto"};
Expand Down
9 changes: 6 additions & 3 deletions src/hictk/load/load.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ namespace hictk::tools {
const auto queue_capacity = queue_capacity_bytes / sizeof(ThinPixel<float>);
PixelQueue<float> pixel_queue{queue_capacity};

auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return);
auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return,
c.transpose_lower_triangular_pixels);
auto consumer =
spawn_consumer(tpool, c, bins, parser.assembly(), format, pixel_queue, early_return);

Expand Down Expand Up @@ -73,7 +74,8 @@ namespace hictk::tools {

const auto& bins = parser.bins();

auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return);
auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return,
c.transpose_lower_triangular_pixels);
auto consumer =
spawn_consumer(tpool, c, bins, parser.assembly(), format, pixel_queue, early_return);

Expand Down Expand Up @@ -104,7 +106,8 @@ namespace hictk::tools {

const auto& bins = parser.bins();

auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return);
auto producer = spawn_producer(tpool, parser, pixel_queue, c.offset, early_return,
c.transpose_lower_triangular_pixels);
auto consumer =
spawn_consumer(tpool, c, bins, parser.assembly(), format, pixel_queue, early_return);

Expand Down
20 changes: 16 additions & 4 deletions src/hictk/load/load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,21 @@ namespace hictk::tools {
const std::filesystem::path& path_to_bins,
std::uint32_t resolution, std::string_view assembly);

template <typename N>
template <bool transpose_pixels = true, typename N>
inline void parse_pixels(PixelParser& parser, std::int64_t offset, PixelQueue<N>& queue,
std::atomic<bool>& early_return) {
ThinPixel<N> buffer{};
while (!early_return && parser.next_pixel(buffer, offset)) {
assert(buffer.bin1_id != ThinPixel<N>::null_id);
assert(buffer.bin2_id != ThinPixel<N>::null_id);
assert(buffer.count != 0);

if constexpr (transpose_pixels) {
if (buffer.bin1_id > buffer.bin2_id) {
std::swap(buffer.bin1_id, buffer.bin2_id);
}
}

while (!queue.try_enqueue(buffer) && !early_return) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
Expand Down Expand Up @@ -133,10 +140,15 @@ template <typename N>
template <typename N>
[[nodiscard]] inline std::future<void> spawn_producer(BS::thread_pool& tpool, PixelParser& parser,
PixelQueue<N>& queue, std::int64_t offset,
std::atomic<bool>& early_return) {
return tpool.submit_task([&parser, &queue, offset, &early_return]() {
std::atomic<bool>& early_return,
bool transpose_lower_triangular_pixels) {
return tpool.submit_task([&parser, &queue, offset, &early_return,
transpose_lower_triangular_pixels]() {
try {
return parse_pixels(parser, offset, queue, early_return);
if (transpose_lower_triangular_pixels) {
return parse_pixels<true>(parser, offset, queue, early_return);
}
return parse_pixels<false>(parser, offset, queue, early_return);
} catch (...) {
SPDLOG_WARN(
FMT_STRING("exception caught in thread parsing interactions: returning immediately!"));
Expand Down
3 changes: 3 additions & 0 deletions test/integration/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ files = [
{ input-path = "integration_tests/4DNFIKNWM36K.subset.validpairs.xz", format = "validpairs" },
{ input-path = "integration_tests/dm6.bins.bed", format = "bed" },
{ input-path = "integration_tests/dm6.chrom.sizes", format = "chrom.sizes" },
{ input-path = "integration_tests/dm6.chrom.sizes.sorted", format = "chrom.sizes" },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", format = "cool" },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.float.cool", format = "cool" },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.variable-bins.cool", format = "cool", variable-bin-size = true }
Expand All @@ -149,6 +150,8 @@ test-cases = [
{ input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.coo.xz", format = "coo", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", timeout = 240, args = { format = "coo", bin-size = 10000, assume-sorted = "" } },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.unsorted.bg2.xz", format = "bg2", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", timeout = 240, args = { format = "bg2", bin-size = 10000, assume-unsorted = "" } },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.unsorted.coo.xz", format = "coo", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.cool", timeout = 240, args = { format = "coo", bin-size = 10000, assume-unsorted = "" } },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.bg2.xz", format = "bg2", chrom-sizes-path = "integration_tests/dm6.chrom.sizes.sorted", expect-failure = true, timeout = 240, args = { format = "bg2", bin-size = 10000, assume-unsorted = "" } },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.bg2.xz", format = "bg2", chrom-sizes-path = "integration_tests/dm6.chrom.sizes.sorted", no-validate = true, timeout = 240, args = { format = "bg2", bin-size = 10000, assume-unsorted = "", transpose-lower-triangular-pixels = "" } },
{ input-path = "integration_tests/4DNFIKNWM36K.subset.sorted.float.coo.xz", format = "coo", chrom-sizes-path = "integration_tests/dm6.chrom.sizes", reference-uri = "integration_tests/4DNFIKNWM36K.subset.fixed-bins.float.cool", timeout = 240, args = { format = "coo", bin-size = 10000, assume-sorted = "", count-as-float = "" } }
]

Expand Down

0 comments on commit 7df9f84

Please sign in to comment.