Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support opening .hic files with only 1 bin per chromosome #180

Merged
merged 4 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions src/hictk/convert/cool_to_hic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ static void copy_pixels(hic::internal::HiCFileWriter& w, const cooler::File& bas
}

static void copy_normalization_vector(hic::internal::HiCFileWriter& w, const cooler::File& clr,
const balancing::Method& norm, bool throw_if_missing) {
std::uint32_t resolution, const balancing::Method& norm,
bool throw_if_missing) {
if (norm == balancing::Method::NONE()) {
return;
}
Expand All @@ -54,8 +55,8 @@ static void copy_normalization_vector(hic::internal::HiCFileWriter& w, const coo
const auto& weights = *clr.normalization(norm);

const auto norm_name = norm.to_string() == "weight" ? "ICE" : norm.to_string();
SPDLOG_INFO(FMT_STRING("[{}] adding {} normalization vector"), clr.resolution(), norm_name);
w.add_norm_vector(norm_name, "BP", clr.resolution(), weights, true);
SPDLOG_INFO(FMT_STRING("[{}] adding {} normalization vector"), resolution, norm_name);
w.add_norm_vector(norm_name, "BP", resolution, weights, true);

} catch (const std::exception& e) {
const std::string_view msg{e.what()};
Expand All @@ -79,7 +80,7 @@ static void copy_normalization_vectors(hic::internal::HiCFileWriter& w,

if (c.input_format == "cool") {
for (const auto& norm : c.normalization_methods) {
copy_normalization_vector(w, base_clr, norm, c.fail_if_normalization_method_is_not_avaliable);
copy_normalization_vector(w, base_clr, base_clr.resolution(), norm, c.fail_if_normalization_method_is_not_avaliable);
}
w.write_norm_vectors_and_norm_expected_values();
return;
Expand All @@ -91,7 +92,7 @@ static void copy_normalization_vectors(hic::internal::HiCFileWriter& w,
for (const auto& res : c.resolutions) {
const auto clr = mclr.open(res);
for (const auto& norm : c.normalization_methods) {
copy_normalization_vector(w, clr, norm, c.fail_if_normalization_method_is_not_avaliable);
copy_normalization_vector(w, clr, res, norm, c.fail_if_normalization_method_is_not_avaliable);
}
}
w.write_norm_vectors_and_norm_expected_values();
Expand Down
4 changes: 3 additions & 1 deletion src/libhictk/hic/include/hictk/hic/impl/file_writer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ inline void HiCFileWriter::write_all_matrix(std::uint32_t target_num_bins) {
static_cast<std::uint32_t>((genome_size + target_num_bins - 1) / target_num_bins);
const auto factor = std::max(std::uint32_t(1), target_resolution / base_resolution);
target_resolution = factor * base_resolution;
const auto target_resolution_scaled = target_resolution / DEFAULT_CHROM_ALL_SCALE_FACTOR;
const auto target_resolution_scaled = std::max(std::uint32_t{1}, target_resolution / DEFAULT_CHROM_ALL_SCALE_FACTOR);

SPDLOG_INFO(FMT_STRING("writing pixels for {}:{} matrix..."), chromosomes().at(0).name(),
chromosomes().at(0).name());
Expand All @@ -367,6 +367,8 @@ inline void HiCFileWriter::write_all_matrix(std::uint32_t target_num_bins) {
genome_size_scaled += static_cast<std::uint32_t>(num_bins) * target_resolution_scaled;
}

genome_size_scaled = std::max(std::uint32_t{1}, genome_size_scaled);

const auto bin_table_ALL = std::make_shared<const BinTable>(
Reference{Chromosome{0, "__ALL__", genome_size_scaled}}, target_resolution_scaled);
const auto chrom = bin_table_ALL->chromosomes().at(0);
Expand Down
13 changes: 7 additions & 6 deletions src/libhictk/hic/include/hictk/hic/impl/pixel_selector_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,18 +209,19 @@ inline std::size_t PixelSelector::estimate_optimal_cache_size(

// Try to guess how many blocks overlap a single row of pixels
std::size_t max_blocks_per_row = 0;
const auto &chrom = coord1().bin1.chrom();
const auto bin_size = bins().resolution();

const std::size_t first_bin_id = 0;
const std::size_t last_bin_id =
bins().at(coord1().bin1.chrom(), coord1().bin1.chrom().size() - 1).rel_id() - 1;
const auto samples = (std::min)(num_samples, bins().subset(coord1().bin1.chrom()).size());
const std::size_t last_bin_id = bins().at(chrom, chrom.size() - 1).rel_id();

const auto samples = std::min(num_samples, bins().subset(chrom).size());
for (std::size_t i = 0; i < samples; ++i) {
const auto bin_id =
std::uniform_int_distribution<std::size_t>{first_bin_id, last_bin_id}(rand_eng);
const auto bin_id = std::uniform_int_distribution<std::size_t>{
first_bin_id, std::min(last_bin_id, last_bin_id - 1)}(rand_eng);

const auto pos1 = static_cast<std::uint32_t>(bin_id * bin_size);
const auto bin1 = bins().at(coord1().bin1.chrom(), pos1);
const auto bin1 = bins().at(chrom, pos1);

auto overlap = idx.find_overlaps({bin1, bin1}, coord2());
const auto num_blocks = static_cast<std::size_t>(std::distance(overlap.begin(), overlap.end()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ inline Reference Reference::add_ALL(std::uint32_t scaling_factor) const {
all_size += chrom.size() / scaling_factor;
}

std::vector<Chromosome> chroms{Chromosome{0, "All", all_size}};
std::vector<Chromosome> chroms{Chromosome{0, "All", std::max(std::uint32_t{1}, all_size)}};
std::copy_if(begin(), end(), std::back_inserter(chroms),
[](const Chromosome& chrom) { return !chrom.is_all(); });

Expand Down
12 changes: 12 additions & 0 deletions test/units/hic/file_writer_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ TEST_CASE("HiC: HiCFileWriter", "[hic][v9][long]") {
const auto path1 = (datadir / "4DNFIZ1ZVXC8.hic9").string();
const auto path2 = (testdir() / "hic_writer_001.hic").string();
const auto path3 = (testdir() / "hic_writer_002.hic").string();
const auto path4 = (testdir() / "hic_writer_003.hic").string();

SECTION("create file (st)") {
const std::vector<std::uint32_t> resolutions{250'000, 500'000, 2'500'000};
Expand All @@ -171,6 +172,17 @@ TEST_CASE("HiC: HiCFileWriter", "[hic][v9][long]") {
hic_file_writer_create_file_test(path1, path2, resolutions, 3, true);
}

SECTION("regression PR 180") {
// Ensure we can create .hic files having bin tables with 1 bin per chromosome
// See https://github.com/paulsengroup/hictk/pull/180
const hictk::Reference chromosomes{{0, "chr1", 10}};
HiCFileWriter w(path4, chromosomes, {100});

const std::vector<Pixel<float>> pixels{Pixel<float>{w.bins(100), 0, 0, 1.0F}};
w.add_pixels(100, pixels.begin(), pixels.end());
w.serialize(); // Before PR 180, this used to throw
}

SECTION("add weights") {
const std::uint32_t resolution = 500'000;
const hic::File hf1(path1, resolution);
Expand Down
Loading