-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
updating parse age to cover more formats #633
Changes from 3 commits
742fcd3
5f5c245
943cd44
9aedd09
397c53a
e674695
e556783
dc8addf
f1f70b1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,9 +95,100 @@ def get_metadata(path): | |
return meta | ||
|
||
|
||
def _parse_iso8601(age): | ||
""" checking if age is proper iso8601, additional formatting""" | ||
if "T" in age[1:]: | ||
date, time = age[1:].split("T") | ||
else: | ||
date, time = age[1:], "" | ||
age_frm = ["P"] | ||
|
||
if date: | ||
for unit in ["Y", "M", "W", "D"]: | ||
if unit in date: | ||
date, part_frm = _parse_iso8601_part(date, unit) | ||
age_frm.append(part_frm) | ||
if date: # if there is still anything left | ||
raise ValueError(f"ISO 8601 expected, but {age} was received") | ||
if time: | ||
age_frm.append("T") | ||
for unit in ["H", "M", "S"]: | ||
if unit in time: | ||
time, part_frm = _parse_iso8601_part(time, unit) | ||
age_frm.append(part_frm) | ||
if date: # if there is still anything left | ||
raise ValueError(f"ISO 8601 expected, but {age} was received") | ||
if age_frm == ["P"]: | ||
raise ValueError(f"ISO 8601 expected, but {age} was received") | ||
return age_frm | ||
|
||
|
||
def _parse_iso8601_part(age, unit): | ||
# splitting to value and remaining part | ||
val, age_rem = age.split(unit) | ||
if float(val) == int(float(val)): | ||
return age_rem, f"{int(float(val))}{unit}" | ||
else: | ||
return age_rem, f"{val}{unit}" | ||
|
||
|
||
def _parse_age_re(age, unit, tp="date"): | ||
""" finding parts that have <value> <unit> in various forms""" | ||
import re | ||
|
||
if unit == "Y": | ||
pat_un = "y(ear)?" | ||
elif unit == "M" and tp == "date": | ||
pat_un = "(month|mon|mo|m)" | ||
elif unit == "W": | ||
pat_un = "w(eek)?" | ||
elif unit == "D": | ||
pat_un = "d(ay)?" | ||
elif unit == "H": | ||
pat_un = "h(our)?" | ||
elif unit == "M" and tp == "time": | ||
pat_un = "(min|m(inute)?)" | ||
elif unit == "S": | ||
pat_un = "(sec|s(econd)?)" | ||
|
||
pattern = rf"(\d+\.?\d*)\s*({pat_un}s?)" | ||
matchstr = re.match(pattern, age, flags=re.I) | ||
if matchstr is None: | ||
# checking pattern with "unit" word | ||
pattern_unit = rf"(\d+\.?\d*)\s*units?:?\s*({pat_un}s?)" | ||
matchstr = re.match(pattern_unit, age, flags=re.I) | ||
if matchstr is None: | ||
return age, None | ||
|
||
if "." in matchstr.group(1): | ||
qty = float(matchstr.group(1)) | ||
if int(qty) == qty: | ||
qty = int(qty) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this code block of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
else: | ||
qty = int(matchstr.group(1)) | ||
age_rem = age.replace(matchstr.group(0), "") | ||
age_rem = age_rem.strip() | ||
if age_rem and age_rem[0] in [",", ";", "."]: | ||
age_rem = age_rem[1:] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there an easy test case to add to cover this line? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think this is already covered somewhere else, so no if needed here |
||
return age_rem, f"{qty}{unit}" | ||
|
||
|
||
def _parse_hours_format(age): | ||
""" parsing format 0:30:10""" | ||
pattern = r"\s*(\d\d?):(\d\d):(\d\d)" | ||
matchstr = re.match(pattern, age, flags=re.I) | ||
if matchstr: | ||
time_part = f"T{int(matchstr.group(1))}H{int(matchstr.group(2))}M{int(matchstr.group(3))}S" | ||
age_rem = age.replace(matchstr.group(0), "") | ||
age_rem = age_rem.strip() | ||
return age_rem, [time_part] | ||
else: | ||
return age, [] | ||
|
||
|
||
def parse_age(age): | ||
""" | ||
Convert a human-friendly duration string into an ISO 8601 duration | ||
Parsing age field and converting into an ISO 8601 duration | ||
|
||
Parameters | ||
---------- | ||
|
@@ -107,13 +198,52 @@ def parse_age(age): | |
------- | ||
str | ||
""" | ||
m = re.fullmatch(r"(\d+)\s*(y(ear)?|m(onth)?|w(eek)?|d(ay)?)s?", age, flags=re.I) | ||
if m: | ||
qty = int(m.group(1)) | ||
unit = m.group(2)[0].upper() | ||
return f"P{qty}{unit}" | ||
else: | ||
raise ValueError(age) | ||
|
||
if not age: | ||
raise ValueError("age is empty") | ||
|
||
age_orig = age | ||
# removing some symbols | ||
for symb in [",", ";", "(", ")"]: | ||
age = age.replace(symb, " ") | ||
age = age.strip() | ||
|
||
if not age: | ||
raise ValueError("age doesn't have any information") | ||
|
||
if age[0] == "P": | ||
age_f = _parse_iso8601(age) | ||
else: # trying to figure out any free form | ||
date_f = [] | ||
for unit in ["Y", "M", "W", "D"]: | ||
if not age: | ||
break | ||
age, part_f = _parse_age_re(age, unit) | ||
if part_f and date_f: | ||
date_f.append(part_f) | ||
elif part_f: | ||
date_f = ["P", part_f] | ||
|
||
time_f = [] | ||
for un in ["H", "M", "S"]: | ||
if not age: | ||
break | ||
age, part_f = _parse_age_re(age, un, tp="time") | ||
if part_f and time_f: | ||
time_f.append(part_f) | ||
elif part_f: | ||
time_f = ["T", part_f] | ||
# trying to find formats 00:00:00 for time | ||
if not time_f: | ||
age, time_f = _parse_hours_format(age) | ||
|
||
age_f = date_f + time_f | ||
if set(age) - {" ", ".", ",", ":", ";"}: | ||
raise ValueError( | ||
f"not able to parse the age: {age_orig}, no rules to convert: {age}" | ||
) | ||
|
||
return "".join(age_f) | ||
|
||
|
||
def extract_age(metadata): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
here are the regex patterns we use in reproschema. for at least iso8601, we should cover the formal patterns for duration definition.
https://github.com/ReproNim/reproschema/blob/master/validation/reproschema-shacl.ttl#L327
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i'm checking the regex as it seems to fail for something specific, but in general the following age patterns should be supported:
https://en.wikipedia.org/wiki/ISO_8601#Durations
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry the error was my mistake. this should help for the duration pattern:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, can move to this, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it doesn't cover "P149.D" - is that ok?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it also doesn't change "P149.0D" to "P149D", but that's perhaps is better
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's not valid iso8601 so that should be ok. i think anything that is close but not ok, could be converted and then verified with the regex. i think my above comment was mostly for ensuring that the age format we use corresponds to one of those two iso8601 cases.
add to age as is
i also suspect that we need to somehow indicate in nwb the age reference (right now we are using gestat* to determine alternate or assigning birth reference). the ideal will be when we don't need to modify at all, just validate iso8601 + reference.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would disagree that the pattern you posted covers all valid ISO8601 duration values and only valid ones. At least not how I understand the wikipedia description, e.g. "P0,5Y" is a valid value that is not covered, and "P0.6Y2D" is not a valid value that would work with the pattern. But I understand that we try to cover as many format as possible
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's just not clear to me should I be fixing values like this "P0.6Y2D" or we just keep them