-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update score log structure #49
base: fix-nonadditive
Are you sure you want to change the base?
Conversation
Codecov ReportAttention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## fix-nonadditive #49 +/- ##
===================================================
+ Coverage 89.18% 89.51% +0.33%
===================================================
Files 40 39 -1
Lines 2320 2413 +93
===================================================
+ Hits 2069 2160 +91
- Misses 251 253 +2
Flags with carried forward coverage won't be shown. Click here to find out more. ☔ View full report in Codecov by Sentry. |
"pgs_id", | ||
"pgp_id", | ||
"pgs_name", | ||
"genome_build", | ||
"variants_number", | ||
"trait_reported", | ||
"trait_efo", | ||
"trait_mapped", | ||
"weight_type", | ||
"citation", | ||
"HmPOS_build", | ||
"HmPOS_date", | ||
"format_version", | ||
"license", | ||
) | ||
|
||
_default_license_text = ( | ||
"PGS obtained from the Catalog should be cited appropriately, and " | ||
"used in accordance with any licensing restrictions set by the authors. See " | ||
"EBI " | ||
"Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional " | ||
"details." | ||
) | ||
|
||
def __init__( | ||
self, | ||
*, | ||
pgs_name, | ||
genome_build, | ||
pgs_id=None, | ||
pgp_id=None, | ||
variants_number=None, | ||
trait_reported=None, | ||
trait_efo=None, | ||
trait_mapped=None, | ||
weight_type=None, | ||
citation=None, | ||
HmPOS_build=None, | ||
HmPOS_date=None, | ||
format_version=None, | ||
license=None, | ||
): | ||
"""kwargs are forced because this is a complicated init and from_path() is | ||
almost always the correct thing to do. | ||
|
||
Mandatory/optional fields in a header are less clear than columns. The | ||
Catalog provides this information automatically but scoring files from other | ||
places might not. | ||
|
||
We don't want to annoy people and force them to reformat their custom scoring | ||
files, but we do require some minimum information for the calculator, | ||
so pgs_name and genome_build are mandatory. | ||
""" | ||
self.pgs_name = pgs_name | ||
self.genome_build = GenomeBuild.from_string(genome_build) | ||
|
||
if self.genome_build is None: | ||
# try overwriting with harmonised data | ||
self.genome_build = GenomeBuild.from_string(HmPOS_build) | ||
|
||
if self.pgs_name is None: | ||
raise ValueError("pgs_name cannot be None") | ||
|
||
# the rest of these fields are optional | ||
self.pgs_id = pgs_id | ||
self.pgp_id = pgp_id | ||
|
||
try: | ||
self.variants_number = int(variants_number) | ||
except TypeError: | ||
self.variants_number = None | ||
|
||
self.trait_reported = trait_reported | ||
self.trait_efo = trait_efo | ||
self.trait_mapped = trait_mapped | ||
self.weight_type = weight_type | ||
self.citation = citation | ||
self.HmPOS_build = GenomeBuild.from_string(HmPOS_build) | ||
self.HmPOS_date = HmPOS_date | ||
self.format_version = format_version | ||
self.license = license | ||
|
||
if self.license is None: | ||
self.license = self._default_license_text | ||
|
||
def __repr__(self): | ||
values = {x: getattr(self, x) for x in self.fields} | ||
value_strings = ", ".join([f"{key}='{value}'" for key, value in values.items()]) | ||
return f"{type(self).__name__}({value_strings})" | ||
|
||
@classmethod | ||
def from_path(cls, path): | ||
raw_header: dict = read_header(path) | ||
|
||
if len(raw_header) == 0: | ||
raise ValueError(f"No header detected in scoring file {path=}") | ||
|
||
header_dict = {k: raw_header.get(k) for k in cls.fields} | ||
|
||
return cls(**header_dict) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
deleted because migrated to a pydantic model
@property | ||
def pgs_id(self): | ||
return self._pgs_id | ||
|
||
if (build := self._header.HmPOS_build) is not None: | ||
self.genome_build = build | ||
self.harmonised = True | ||
@property | ||
def is_harmonised(self): | ||
return self._header.is_harmonised | ||
|
||
@property | ||
def genome_build(self): | ||
if self.is_harmonised: | ||
return self._header.HmPOS_build | ||
else: | ||
self.genome_build = self._header.genome_build | ||
self.harmonised = False | ||
return self._header.genome_build | ||
|
||
@property | ||
def header(self): | ||
return self._header |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added these properties because they're a really helpful shortcut in the CLI
def get_log(self, drop_missing=False, variant_log=None): | ||
"""Create a JSON log from a ScoringFile's header and variant rows.""" | ||
|
||
logger.debug(f"Creating JSON log for {self!r}") | ||
|
||
log = {} | ||
|
||
for attr in self._header.fields: | ||
if (extracted_attr := getattr(self._header, attr, None)) is not None: | ||
log[attr] = str(extracted_attr) | ||
else: | ||
log[attr] = None | ||
|
||
if log["variants_number"] is None: | ||
# custom scoring files might not have this information | ||
log["variants_number"] = variant_log["n_variants"] | ||
|
||
if ( | ||
variant_log is not None | ||
and int(log["variants_number"]) != variant_log["n_variants"] | ||
and not drop_missing | ||
): | ||
logger.warning( | ||
f"Mismatch between header ({log['variants_number']}) and output row count ({variant_log['n_variants']}) for {self.pgs_id}" | ||
) | ||
logger.warning( | ||
"This can happen with older scoring files in the PGS Catalog (e.g. PGS000028)" | ||
) | ||
|
||
# multiple terms may be separated with a pipe | ||
if log["trait_mapped"]: | ||
log["trait_mapped"] = log["trait_mapped"].split("|") | ||
|
||
if log["trait_efo"]: | ||
log["trait_efo"] = log["trait_efo"].split("|") | ||
|
||
log["columns"] = self._fields | ||
log["use_harmonised"] = self.harmonised | ||
|
||
if variant_log is not None: | ||
log["sources"] = [k for k, v in variant_log.items() if k != "n_variants"] | ||
|
||
return {self.pgs_id: log} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replaced by the pydantic models + model_dump()
class ScoreVariant(CatalogScoreVariant): | ||
"""This model includes attributes useful for processing and normalising variants | ||
|
||
>>> variant = ScoreVariant(**{"rsID": None, "chr_name": "1", "chr_position": 1, "effect_allele": "A", "effect_weight": 0.5, "row_nr": 0, "accession": "test"}) | ||
>>> variant # doctest: +ELLIPSIS | ||
ScoreVariant(rsID=None, chr_name='1', chr_position=1, effect_allele=Allele(allele='A', ... | ||
>>> variant.is_complex | ||
False | ||
>>> variant.is_non_additive | ||
False | ||
>>> variant.is_harmonised | ||
False | ||
>>> variant.effect_type | ||
EffectType.ADDITIVE | ||
|
||
>>> variant_missing_positions = ScoreVariant(**{"rsID": None, "chr_name": None, "chr_position": None, "effect_allele": "A", "effect_weight": 0.5, "row_nr": 0, "accession": "test"}) # doctest: +ELLIPSIS | ||
Traceback (most recent call last): | ||
... | ||
TypeError: Bad position: self.rsID=None, self.chr_name=None, self.chr_position=None | ||
|
||
>>> harmonised_variant = ScoreVariant(**{"rsID": None, "chr_name": "1", "chr_position": 1, "effect_allele": "A", "effect_weight": 0.5, "hm_chr": "1", "hm_pos": 1, "hm_rsID": "rs1921", "hm_source": "ENSEMBL", "row_nr": 0, "accession": "test"}) | ||
>>> harmonised_variant.is_harmonised | ||
True | ||
|
||
>>> variant_badly_harmonised = ScoreVariant(**{"rsID": None, "chr_name": "1", "chr_position": 1, "effect_allele": "A", "effect_weight": 0.5, "hm_chr": None, "hm_pos": None, "hm_rsID": "rs1921", "hm_source": "ENSEMBL", "row_nr": 0, "accession": "test"}) | ||
Traceback (most recent call last): | ||
... | ||
TypeError: Missing harmonised column data: hm_chr | ||
|
||
>>> variant_nonadditive = ScoreVariant(**{"rsID": None, "chr_name": "1", "chr_position": 1, "effect_allele": "A", "effect_weight": 0.5, "dosage_0_weight": 0, "dosage_1_weight": 1, "row_nr": 0, "accession": "test"}) | ||
>>> variant_nonadditive.is_non_additive | ||
True | ||
>>> variant_nonadditive.is_complex | ||
False | ||
>>> variant_nonadditive.effect_type | ||
EffectType.NONADDITIVE | ||
|
||
>>> variant_complex = ScoreVariant(**{"rsID": None, "chr_name": "1", "chr_position": 1, "effect_allele": "A", "effect_weight": 0.5, "is_haplotype": True, "row_nr": 0, "accession": "test"}) | ||
>>> variant_complex.is_complex | ||
True | ||
""" | ||
|
||
model_config = ConfigDict(use_enum_values=True) | ||
|
||
row_nr: int = Field( | ||
title="Row number", | ||
description="Row number of variant in scoring file (first variant = 0)", | ||
) | ||
accession: str = Field(title="Accession", description="Accession of score variant") | ||
is_duplicated: Optional[bool] = Field( | ||
default=False, | ||
title="Duplicated variant", | ||
description="In a list of variants with the same accession, is ID duplicated?", | ||
) | ||
|
||
# column names for output are used by __iter__ and when writing out | ||
output_fields: ClassVar[tuple[str]] = ( | ||
"chr_name", | ||
"chr_position", | ||
"effect_allele", | ||
"other_allele", | ||
"effect_weight", | ||
"effect_type", | ||
"is_duplicated", | ||
"accession", | ||
"row_nr", | ||
) | ||
|
||
def __iter__(self): | ||
for attr in self.output_fields: | ||
yield getattr(self, attr) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thought it would be best to keep all pydantic models in this module 🤔
What's changed
Added pydantic models for:
Integrated these models with the ScoringFile object and combine CLI
There is a breaking change in log structure. Changes include:
compatible_effect_type
field (bool){'PGSXXXX': { nested_dict }
) has been made static because of pydantic limitations (replaced with "header" field)Why changed
#46 (comment)
Example log file
PGS000001 is normalised OK and will be in the output, PGS004255 is ignored but warned about.