dandi · satra · Jun 11, 2021 · May 11, 2021 · May 11, 2021 · May 11, 2021
diff --git a/dandi/metadata.py b/dandi/metadata.py
@@ -95,9 +95,100 @@ def get_metadata(path):
     return meta
 
 
+def _parse_iso8601(age):
+    """ checking if age is proper iso8601, additional formatting"""
+    if "T" in age[1:]:
+        date, time = age[1:].split("T")
+    else:
+        date, time = age[1:], ""
+    age_frm = ["P"]
+
+    if date:
+        for unit in ["Y", "M", "W", "D"]:
+            if unit in date:
+                date, part_frm = _parse_iso8601_part(date, unit)
+                age_frm.append(part_frm)
+        if date:  # if there is still anything left
+            raise ValueError(f"ISO 8601 expected, but {age} was received")
+    if time:
+        age_frm.append("T")
+        for unit in ["H", "M", "S"]:
+            if unit in time:
+                time, part_frm = _parse_iso8601_part(time, unit)
+                age_frm.append(part_frm)
+        if date:  # if there is still anything left
+            raise ValueError(f"ISO 8601 expected, but {age} was received")
+    if age_frm == ["P"]:
+        raise ValueError(f"ISO 8601 expected, but {age} was received")
+    return age_frm
+
+
+def _parse_iso8601_part(age, unit):
+    # splitting to value and remaining part
+    val, age_rem = age.split(unit)
+    if float(val) == int(float(val)):
+        return age_rem, f"{int(float(val))}{unit}"
+    else:
+        return age_rem, f"{val}{unit}"
+
+
+def _parse_age_re(age, unit, tp="date"):
+    """ finding parts that have <value> <unit> in various forms"""
+    import re
+
+    if unit == "Y":
+        pat_un = "y(ear)?"
+    elif unit == "M" and tp == "date":
+        pat_un = "(month|mon|mo|m)"
+    elif unit == "W":
+        pat_un = "w(eek)?"
+    elif unit == "D":
+        pat_un = "d(ay)?"
+    elif unit == "H":
+        pat_un = "h(our)?"
+    elif unit == "M" and tp == "time":
+        pat_un = "(min|m(inute)?)"
+    elif unit == "S":
+        pat_un = "(sec|s(econd)?)"
+
+    pattern = rf"(\d+\.?\d*)\s*({pat_un}s?)"
+    matchstr = re.match(pattern, age, flags=re.I)
+    if matchstr is None:
+        # checking pattern with "unit" word
+        pattern_unit = rf"(\d+\.?\d*)\s*units?:?\s*({pat_un}s?)"
+        matchstr = re.match(pattern_unit, age, flags=re.I)
+        if matchstr is None:
+            return age, None
+
+    if "." in matchstr.group(1):
+        qty = float(matchstr.group(1))
+        if int(qty) == qty:
+            qty = int(qty)
+    else:
+        qty = int(matchstr.group(1))
+    age_rem = age.replace(matchstr.group(0), "")
+    age_rem = age_rem.strip()
+    if age_rem and age_rem[0] in [",", ";", "."]:
+        age_rem = age_rem[1:]
+    return age_rem, f"{qty}{unit}"
+
+
+def _parse_hours_format(age):
+    """ parsing format 0:30:10"""
+    pattern = r"\s*(\d\d?):(\d\d):(\d\d)"
+    matchstr = re.match(pattern, age, flags=re.I)
+    if matchstr:
+        time_part = f"T{int(matchstr.group(1))}H{int(matchstr.group(2))}M{int(matchstr.group(3))}S"
+        age_rem = age.replace(matchstr.group(0), "")
+        age_rem = age_rem.strip()
+        return age_rem, [time_part]
+    else:
+        return age, []
+
+
 def parse_age(age):
     """
-    Convert a human-friendly duration string into an ISO 8601 duration
+    Parsing age field and converting into an ISO 8601 duration
 
     Parameters
     ----------
@@ -107,13 +198,52 @@ def parse_age(age):
     -------
     str
     """
-    m = re.fullmatch(r"(\d+)\s*(y(ear)?|m(onth)?|w(eek)?|d(ay)?)s?", age, flags=re.I)
-    if m:
-        qty = int(m.group(1))
-        unit = m.group(2)[0].upper()
-        return f"P{qty}{unit}"
-    else:
-        raise ValueError(age)
+
+    if not age:
+        raise ValueError("age is empty")
+
+    age_orig = age
+    # removing some symbols
+    for symb in [",", ";", "(", ")"]:
+        age = age.replace(symb, " ")
+    age = age.strip()
+
+    if not age:
+        raise ValueError("age doesn't have any information")
+
+    if age[0] == "P":
+        age_f = _parse_iso8601(age)
+    else:  # trying to figure out any free form
+        date_f = []
+        for unit in ["Y", "M", "W", "D"]:
+            if not age:
+                break
+            age, part_f = _parse_age_re(age, unit)
+            if part_f and date_f:
+                date_f.append(part_f)
+            elif part_f:
+                date_f = ["P", part_f]
+
+        time_f = []
+        for un in ["H", "M", "S"]:
+            if not age:
+                break
+            age, part_f = _parse_age_re(age, un, tp="time")
+            if part_f and time_f:
+                time_f.append(part_f)
+            elif part_f:
+                time_f = ["T", part_f]
+        # trying to find formats 00:00:00 for time
+        if not time_f:
+            age, time_f = _parse_hours_format(age)
+
+        age_f = date_f + time_f
+        if set(age) - {" ", ".", ",", ":", ";"}:
+            raise ValueError(
+                f"not able to parse the age: {age_orig}, no rules to convert: {age}"
+            )
+
+    return "".join(age_f)
 
 
 def extract_age(metadata):

diff --git a/dandi/tests/test_metadata.py b/dandi/tests/test_metadata.py
@@ -39,6 +39,7 @@ def schema_dir(tmp_path_factory):
         ("2 m", "P2M"),
         ("2M", "P2M"),
         ("2m", "P2M"),
+        ("2mo", "P2M"),
         ("3 weeks", "P3W"),
         ("3 w", "P3W"),
         ("3 W", "P3W"),
@@ -49,12 +50,40 @@ def schema_dir(tmp_path_factory):
         ("7 D", "P7D"),
         ("7d", "P7D"),
         ("7D", "P7D"),
+        ("7 day", "P7D"),
+        ("7 Days", "P7D"),
+        ("P136D", "P136D"),
+        ("P22265.0D", "P22265D"),
+        ("349 days, 4 hours", "P349DT4H"),
+        ("12 weeks, 13 d; 10 hours, 30 min 1sec", "P12W13DT10H30M1S"),
+        ("342 days, 4:30:02", "P342DT4H30M2S"),
+        ("342 days, 00:00:00", "P342DT0H0M0S"),
+        ("14 (Units: days)", "P14D"),
+        ("14 unit day", "P14D"),
     ],
 )
 def test_parse_age(age, duration):
     assert parse_age(age) == duration
 
 
+@pytest.mark.parametrize(
+    "age, match_er",
+    [
+        ("123", "no rules to convert: 123"),
+        ("P12", "ISO 8601 expected, but P12 was received"),
+        ("3-7 months", "no rules to convert: 3-7 months"),
+        ("Gestational Week 19", "no rules to convert: Gestational Week 19"),
+        ("3 months, some extra", "no rules to convert: some extra"),
+        (" , ", "age doesn't have any information"),
+        ("", "age is empty"),
+        (None, "age is empty"),
+    ],
+)
+def test_parse_error(age, match_er):
+    with pytest.raises(ValueError, match=match_er):
+        parse_age(age)
+
+
 @pytest.mark.parametrize(
     "td,duration",
     [