From a86bf610f7ee393cc55e6307e301f21bb87db2f9 Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 9 Sep 2024 16:06:17 +0100 Subject: [PATCH 1/2] remove targetvariants class --- .../src/pgscatalog/core/lib/__init__.py | 3 - .../src/pgscatalog/core/lib/targetvariants.py | 189 ------------------ 2 files changed, 192 deletions(-) delete mode 100644 pgscatalog.core/src/pgscatalog/core/lib/targetvariants.py diff --git a/pgscatalog.core/src/pgscatalog/core/lib/__init__.py b/pgscatalog.core/src/pgscatalog/core/lib/__init__.py index ea01d29..cd2b926 100644 --- a/pgscatalog.core/src/pgscatalog/core/lib/__init__.py +++ b/pgscatalog.core/src/pgscatalog/core/lib/__init__.py @@ -4,7 +4,6 @@ from .catalogapi import ScoreQueryResult, CatalogQuery, CatalogCategory from .scorefiles import ScoringFiles, ScoringFile, NormalisedScoringFile from .scorevariant import ScoreVariant -from .targetvariants import TargetVariants, TargetVariant, TargetType from ._relabel import RelabelArgs, relabel, relabel_write from ._sortpaths import effect_type_keyfunc, chrom_keyfunc from .pgsexceptions import ( @@ -57,8 +56,6 @@ "CatalogQuery", "ScoreQueryResult", "CatalogCategory", - "TargetVariant", - "TargetVariants", "TargetType", "NormalisedScoringFile", "RelabelArgs", diff --git a/pgscatalog.core/src/pgscatalog/core/lib/targetvariants.py b/pgscatalog.core/src/pgscatalog/core/lib/targetvariants.py deleted file mode 100644 index 29143c1..0000000 --- a/pgscatalog.core/src/pgscatalog/core/lib/targetvariants.py +++ /dev/null @@ -1,189 +0,0 @@ -"""This module contains classes to work with target variants. When a scoring file is -being reused to calculate scores for new genotypes, the new genotypes are target -genomes.""" - -import enum -import csv -import pathlib - -from xopen import xopen - - -class TargetVariant: - """A single target variant, including genomic coordinates and allele information - - >>> a = TargetVariant(chrom="1", pos=12, ref="A", alt="C", id='1:12:A:C') - >>> a - TargetVariant(chrom='1', pos=12, ref='A', alt='C', id='1:12:A:C') - >>> b = a - >>> b == a - True - """ - - def __init__(self, *, chrom, pos, ref, alt, id): - self.chrom = chrom - self.pos = int(pos) - self.ref = ref - self.alt = alt - self.id = id - - def __repr__(self): - return ( - f"{self.__class__.__name__}(chrom={repr(self.chrom)}, pos=" - f"{repr(self.pos)}, " - f"ref={repr(self.ref)}, " - f"alt={repr(self.alt)}, " - f"id={repr(self.id)})" - ) - - def __hash__(self): - return hash((self.chrom, self.pos, self.ref, self.alt, self.id)) - - def __eq__(self, other): - if isinstance(other, TargetVariant): - return (self.chrom, self.pos, self.ref, self.alt) == ( - other.chrom, - other.pos, - other.ref, - other.alt, - ) - return False - - -class TargetVariants: - """A container of :class:`TargetVariant` - - :raises: FileNotFoundError - - >>> from pgscatalog.core import Config # ignore, only to load test data - >>> pvar = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.pvar") - >>> pvar.ftype - - - Iterating over TargetVariants is done via a read-only generator attribute: - - >>> pvar.variants # doctest: +ELLIPSIS - - >>> for variant in pvar: - ... variant - ... break - TargetVariant(chrom='14', pos=65003549, ref='T', alt='C', id='14:65003549:T:C') - - gzip and zstandard compression is transparently handled for pvar: - - >>> pvar = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.pvar.zst") - >>> for variant in pvar: - ... variant - ... break - TargetVariant(chrom='14', pos=65003549, ref='T', alt='C', id='14:65003549:T:C') - - The same is true for bim files: - - >>> bim = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.bim.gz") - >>> bim.ftype - - >>> for variant in bim: - ... variant - ... break - TargetVariant(chrom='1', pos=10180, ref='C', alt='T', id='1:10180:T:C') - >>> bim = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.bim.zst") - >>> for variant in bim: - ... variant - ... break - TargetVariant(chrom='1', pos=10180, ref='C', alt='T', id='1:10180:T:C') - >>> bim = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "hapnest.bim") - >>> for variant in bim: - ... variant - ... break - TargetVariant(chrom='1', pos=10180, ref='C', alt='T', id='1:10180:T:C') - - Note, A1/A2 isn't guaranteed to be ref/alt because of PLINK1 file format - limitations. PGS Catalog libraries handle this internally, but you should be - aware REF/ALT can be swapped by plink during VCF to bim conversion. - - Some pvar files can contain a lot of comments in the header, which are ignored: - - >>> pvar = TargetVariants(Config.ROOT_DIR / "tests" / "data" / "1000G.pvar") - >>> for variant in pvar: - ... variant - ... break - TargetVariant(chrom='1', pos=10390, ref='CCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA', alt='C', id='1:10390:CCCCTAACCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA:C') - """ - - def __init__(self, path, chrom=None): - match n := pathlib.Path(path).name: - case _ if "pvar" in n: - self.ftype = TargetType.PVAR - case _ if "bim" in n: - self.ftype = TargetType.BIM - case _: - raise ValueError(f"Unknown target type {n!r}") - - self._chrom = chrom - self._path = str(path) - - def __repr__(self): - return f"{self.__class__.__name__}(path={repr(self.path)})" - - def __iter__(self): - yield from self.variants - - @property - def path(self): - return self._path - - @property - def chrom(self): - return self._chrom - - @property - def variants(self): - match self.ftype: - case TargetType.BIM: - return read_bim(self.path) - case TargetType.PVAR: - return read_pvar(self.path) - case _: - raise ValueError - - -def read_pvar(path): - """Read plink2 pvar variant information files using python core library""" - - with xopen(path, "rt") as f: - # pvars do have a header column and support arbitrary columns - for line in f: - if line.startswith("##"): - continue - else: - fieldnames = line.strip().split("\t") - break - reader = csv.DictReader(f, fieldnames=fieldnames, delimiter="\t") - fields = { - "#CHROM": "chrom", - "POS": "pos", - "REF": "ref", - "ALT": "alt", - "ID": "id", - } - for row in reader: - yield TargetVariant(**{v: row[k] for k, v in fields.items()}) - - -def read_bim(path): - """Read plink1 bim variant information files using python core library""" - with xopen(path, "rt") as f: - # bims don't have header column - reader = csv.reader(f, delimiter="\t") - # yes, A1/A2 in bim isn't ref/alt - fields = ["chrom", "id", "pos_cm", "pos", "ref", "alt"] - for row in reader: - row = dict(zip(fields, row, strict=True)) - yield TargetVariant( - **{k: row[k] for k in ("chrom", "pos", "ref", "alt", "id")} - ) - - -class TargetType(enum.Enum): - PVAR = enum.auto() - BIM = enum.auto() From 13cefc2122b694fde5da72210fea1a2a43dde6cf Mon Sep 17 00:00:00 2001 From: Benjamin Wingfield Date: Mon, 9 Sep 2024 16:09:20 +0100 Subject: [PATCH 2/2] fix deleted imports --- pgscatalog.core/src/pgscatalog/core/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pgscatalog.core/src/pgscatalog/core/__init__.py b/pgscatalog.core/src/pgscatalog/core/__init__.py index 4bd72e4..624ddd0 100644 --- a/pgscatalog.core/src/pgscatalog/core/__init__.py +++ b/pgscatalog.core/src/pgscatalog/core/__init__.py @@ -12,9 +12,6 @@ NormalisedScoringFile, ScoreVariant, GenomeBuild, - TargetVariants, - TargetVariant, - TargetType, RelabelArgs, relabel, relabel_write, @@ -37,9 +34,6 @@ "CatalogQuery", "ScoreQueryResult", "CatalogCategory", - "TargetVariant", - "TargetVariants", - "TargetType", "NormalisedScoringFile", "RelabelArgs", "relabel",