Skip to content

Commit

Permalink
Merge pull request #260 from jeromekelleher/fix-parse-bug
Browse files Browse the repository at this point in the history
Fix character field bounds update error
  • Loading branch information
jeromekelleher authored Jun 25, 2024
2 parents 20e6dd1 + 937aee0 commit d192054
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 18 deletions.
8 changes: 5 additions & 3 deletions bio2zarr/vcf2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,9 +500,9 @@ def transform(self, vcf_value):
def transform_and_update_bounds(self, vcf_value):
if vcf_value is None:
return None
# print(self, self.field.full_name, "T", vcf_value)
value = self.transform(vcf_value)
self.update_bounds(value)
# print(self.field.full_name, "T", vcf_value, "->", value)
return value


Expand Down Expand Up @@ -531,13 +531,15 @@ def update_bounds(self, value):
class StringValueTransformer(VcfValueTransformer):
def update_bounds(self, value):
summary = self.field.summary
number = value.shape[-1]
if self.field.category == "FORMAT":
number = max(len(v) for v in value)
else:
number = value.shape[-1]
# TODO would be nice to report string lengths, but not
# really necessary.
summary.max_number = max(summary.max_number, number)

def transform(self, vcf_value):
# print("transform", vcf_value)
if self.dimension == 1:
value = np.array(list(vcf_value.split(",")))
else:
Expand Down
17 changes: 8 additions & 9 deletions bio2zarr/vcf2zarr/verification.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,17 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
assert_all_fill(zarr_val[1:], vcf_type)


def assert_format_val_equal(vcf_val, zarr_val, vcf_type):
def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
assert vcf_val is not None
assert isinstance(vcf_val, np.ndarray)
if vcf_type in ("String", "Character"):
assert len(vcf_val) == len(zarr_val)
for v, z in zip(vcf_val, zarr_val):
split = list(v.split(","))
# Note: deliberately duplicating logic here between this and the
# INFO col above to make sure all combinations are covered by tests
k = len(split)
if k == 1:
if vcf_number == "1":
assert v == z
else:
split = list(v.split(","))
k = len(split)
nt.assert_equal(split, z[:k])
assert_all_fill(z[k:], vcf_type)
else:
Expand Down Expand Up @@ -173,7 +171,8 @@ def verify(vcf_path, zarr_path, show_progress=False):
if colname.startswith("call") and not colname.startswith("call_genotype"):
vcf_name = colname.split("_", 1)[1]
vcf_type = format_headers[vcf_name]["Type"]
format_fields[vcf_name] = vcf_type, iter(root[colname])
vcf_number = format_headers[vcf_name]["Number"]
format_fields[vcf_name] = vcf_type, vcf_number, iter(root[colname])
if colname.startswith("variant"):
name = colname.split("_", 1)[1]
if name.isupper():
Expand Down Expand Up @@ -221,10 +220,10 @@ def verify(vcf_path, zarr_path, show_progress=False):
else:
assert_info_val_equal(vcf_val, zarr_val, vcf_type)

for name, (vcf_type, zarr_iter) in format_fields.items():
for name, (vcf_type, vcf_number, zarr_iter) in format_fields.items():
vcf_val = row.format(name)
zarr_val = next(zarr_iter)
if vcf_val is None:
assert_format_val_missing(zarr_val, vcf_type)
else:
assert_format_val_equal(vcf_val, zarr_val, vcf_type)
assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number)
Binary file added tests/data/vcf/issue_251.vcf.gz
Binary file not shown.
Binary file added tests/data/vcf/issue_251.vcf.gz.csi
Binary file not shown.
12 changes: 6 additions & 6 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,12 +198,12 @@ def test_5_chunk_1(self, n, expected):
@pytest.mark.parametrize(
("path", "expected"),
[
# NOTE: this data was generated using du -sb on a Linux system.
# It *might* work in CI, but it may well not either, as it's
# probably dependent on a whole bunch of things. Expect to fail
# at some point.
("tests/data", 4973315),
("tests/data/vcf", 4961178),
# NOTE: this data is generated using du -sb on a Linux system.
# It works in CI on Linux, but it'll probably break at some point.
# It's also necessary to update these numbers each time a new data
# file is added.
("tests/data", 4973751),
("tests/data/vcf", 4961614),
("tests/data/vcf/sample.vcf.gz", 1089),
],
)
Expand Down
1 change: 1 addition & 0 deletions tests/test_vcf_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,7 @@ def test_duplicate_paths(self, tmp_path):
"field_type_combos.vcf.gz",
"out_of_order_contigs.vcf.gz",
"chr_m_indels.vcf.gz",
"issue_251.vcf.gz",
],
)
def test_by_validating(name, tmp_path):
Expand Down

0 comments on commit d192054

Please sign in to comment.