diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index 91e79c1..f5554c5 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -212,6 +212,7 @@ def make_field_def(name, vcf_type, vcf_number): make_field_def("FILTERS", "String", "."), make_field_def("REF", "String", "1"), make_field_def("ALT", "String", "."), + make_field_def("rlen", "Integer", "1"), # computed field ] return fields @@ -1276,6 +1277,7 @@ def process_partition(self, partition_index): tcw.append("FILTERS", variant.FILTERS) tcw.append("REF", variant.REF) tcw.append("ALT", variant.ALT) + tcw.append("rlen", variant.end - variant.start) for field in info_fields: tcw.append(field.full_name, variant.INFO.get(field.name, None)) if has_gt: diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 801670e..dcaef3f 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -37,6 +37,7 @@ def inspect(path): "variant_contig": "An identifier from the reference genome or an angle-bracketed ID" " string pointing to a contig in the assembly file", "variant_position": "The reference position", + "variant_length": "The length of the variant measured in bases", "variant_id": "List of unique identifiers where applicable", "variant_allele": "List of the reference and alternate alleles", "variant_quality": "Phred-scaled quality score", @@ -302,11 +303,12 @@ def fixed_field_spec( ] name_map = {field.full_name: field for field in icf.metadata.fields} - # Only two of the fixed fields have a direct one-to-one mapping. + # Only three of the fixed fields have a direct one-to-one mapping. array_specs.extend( [ spec_from_field(name_map["QUAL"], array_name="variant_quality"), spec_from_field(name_map["POS"], array_name="variant_position"), + spec_from_field(name_map["rlen"], array_name="variant_length"), ] ) array_specs.extend( diff --git a/tests/test_icf.py b/tests/test_icf.py index a5c118f..954a510 100644 --- a/tests/test_icf.py +++ b/tests/test_icf.py @@ -18,7 +18,7 @@ class TestSmallExample: 'ALT', 'CHROM', 'FILTERS', 'FORMAT/DP', 'FORMAT/GQ', 'FORMAT/GT', 'FORMAT/HQ', 'ID', 'INFO/AA', 'INFO/AC', 'INFO/AF', 'INFO/AN', 'INFO/DB', 'INFO/DP', 'INFO/H2', - 'INFO/NS', 'POS', 'QUAL', 'REF' + 'INFO/NS', 'POS', 'QUAL', 'REF', 'rlen' ) # fmt: on @@ -117,6 +117,7 @@ class TestLocalAllelesExample: "POS", "QUAL", "REF", + "rlen", ) @pytest.fixture(scope="class") @@ -451,7 +452,7 @@ def icf(self, tmp_path_factory): def test_repr(self, icf): assert repr(icf).startswith( - "IntermediateColumnarFormat(fields=7, partitions=5, records=4665, path=" + "IntermediateColumnarFormat(fields=8, partitions=5, records=4665, path=" ) def test_pos_repr(self, icf): diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index c736eed..e4f48b5 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -56,6 +56,9 @@ def test_position(self, ds): [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10], ) + def test_length(self, ds): + nt.assert_array_equal(ds["variant_length"], [1, 1, 1, 1, 1, 1, 1, 1, 2]) + def test_int_info_fields(self, ds): nt.assert_array_equal( ds["variant_NS"], @@ -938,6 +941,7 @@ def test_info_fields(self, ds): "variant_filter", "variant_contig", "variant_position", + "variant_length", "variant_allele", "variant_id", "variant_id_mask",