Merge pull request #3 from HealthNLPorg/v0.0.1

V0.0.1
HealthNLPorg · Jan 31, 2024 · 8935776 · 8935776
2 parents 8410952 + 29473fb
commit 8935776
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -125,13 +125,7 @@ mybroker/bin/artemis stop
 ## Input and output structure
 
 Given the structure of the summarized gold timelines and the shared task data, the Docker assumes that the input in the `input`
-folder will take the form of a collection of notes comprising all the patients of a given cancer type cohort (for the shared task one of melanoma or ovarian or breast cancers), the base filenames of which will correspond to the scheme:
-```
-<patient identifier>_<four digit year>_<two digit month>_<two digit date>
-```
-Where the year month and date correspond to the creation time of the file.  All the files in the shared task dataset follow this schema so for our data there is nothing you need to do.
-
-Assuming successful processing, the output file will be a tab separated value (`tsv`) file in the `output` folder.
+folder will take the form of a collection of notes comprising all the patients of a given cancer type cohort.  Assuming successful processing, the output file will be a tab separated value (`tsv`) file in the `output` folder.
 The file will have the columns:
 ```
 DCT	patient_id	chemo_text	chemo_annotation_id	normed_timex	timex_annotation_id	tlink	note_name	tlink_inst

diff --git a/...instance-generator/src/main/java/org/apache/ctakes/temporal/ae/TimeMentionNormalizer.java b/...instance-generator/src/main/java/org/apache/ctakes/temporal/ae/TimeMentionNormalizer.java
@@ -111,7 +111,7 @@ public void process( JCas jCas ) throws AnalysisEngineProcessException {
                 .anyMatch( tui -> this.tuiSet.contains( tui ) );
 
             if ( !hasRelevantTUIs ){
-                LOGGER.info(fileName + " : no events with the provided TUIs " + this.tuis + "skipping to save time");
+                LOGGER.info(fileName + " : no events with the provided TUIs " + this.tuis + " skipping to save time");
                 return;
             }
         }

diff --git a/...er/resources/org/apache/ctakes/timelines/timelines_py/src/timelines/timeline_delegator.py b/...er/resources/org/apache/ctakes/timelines/timelines_py/src/timelines/timeline_delegator.py
@@ -377,6 +377,7 @@ def process(self, cas: Cas):
         if len(proc_mentions) > 0:
             self._write_raw_timelines(cas, proc_mentions)
         else:
+            self._add_empty_discovery(cas)
             patient_id, note_name = pt_and_note(cas)
             print(
                 f"No chemotherapy mentions ( using TUI: {CHEMO_TUI} ) found in patient {patient_id} note {note_name}  - skipping"
@@ -424,6 +425,7 @@ def _write_raw_timelines(self, cas: Cas, proc_mentions: List[FeatureStructure]):
             )
             self._write_actual_proc_mentions(cas, actual_proc_mentions)
         else:
+            self._add_empty_discovery(cas)
             print(
                 f"No concrete chemotherapy mentions found in patient {patient_id} note {note_name} - skipping"
             )
@@ -432,14 +434,18 @@ def _write_actual_proc_mentions(
         self, cas: Cas, positive_chemo_mentions: List[FeatureStructure]
     ):
         timex_type = cas.typesystem.get_type(ctakes_types.TimeMention)
-        event_type = cas.typesystem.get_type(ctakes_types.EventMention)
         cas_source_data = cas.select(ctakes_types.Metadata)[0].sourceData
         document_creation_time = cas_source_data.sourceOriginalDate
         relevant_timexes = timexes_with_normalization(cas.select(timex_type))
 
         base_tokens, token_map = tokens_and_map(cas, mode="dtr")
         begin2token, end2token = invert_map(token_map)
 
+        def local_window_mentions(chemo):
+            return get_tlink_window_mentions(
+                chemo, relevant_timexes, begin2token, end2token, token_map
+            )
+
         def dtr_result(chemo):
             inst = get_dtr_instance(chemo, base_tokens, begin2token, end2token)
             result = list(self.dtr_classifier(inst))[0]
@@ -455,9 +461,7 @@ def tlink_result(chemo, timex):
             return label, inst
 
         def tlink_result_dict(chemo):
-            window_mentions = get_tlink_window_mentions(
-                chemo, relevant_timexes, begin2token, end2token, token_map
-            )
+            window_mentions = local_window_mentions(chemo)
             return {
                 window_mention: tlink_result(chemo, window_mention)
                 for window_mention in window_mentions
@@ -475,17 +479,32 @@ def tlink_result_dict(chemo):
                 )
             )
         }
-        if len(list(relevant_timexes)) == 0:
+
+        if (
+            len(list(relevant_timexes)) == 0
+            or len(
+                list(
+                    chain.from_iterable(
+                        map(local_window_mentions, positive_chemo_mentions)
+                    )
+                )
+            )
+            == 0
+        ):
             print(
-                f"WARNING: No normalized timexes discovered in {patient_id} file {note_name}"
+                f"WARNING: No timexes suitable for TLINK pairing discovered in {patient_id} file {note_name}"
             )
+            self._add_empty_discovery(cas)
+            return
         for chemo in positive_chemo_mentions:
+            chemo_dtr, dtr_inst = "", ""  # purely so pyright stops complaining
             if self.use_dtr:
                 chemo_dtr, dtr_inst = dtr_result(chemo)
             tlink_dict = tlink_result_dict(chemo)
             for timex, tlink_inst_pair in tlink_dict.items():
                 tlink, tlink_inst = tlink_inst_pair
                 chemo_text = normalize_mention(chemo)
+                # if we made it this far these attributes are populated
                 timex_text = timex.time.normalizedForm
                 if self.use_dtr:
                     instance = [
@@ -514,3 +533,35 @@ def tlink_result_dict(chemo):
                         tlink_inst,
                     ]
                 self.raw_events.append(instance)
+
+    def _add_empty_discovery(self, cas: Cas):
+        cas_source_data = cas.select(ctakes_types.Metadata)[0].sourceData
+        document_creation_time = cas_source_data.sourceOriginalDate
+        patient_id, note_name = pt_and_note(cas)
+        if self.use_dtr:
+            instance = [
+                document_creation_time,
+                patient_id,
+                "none",
+                "none",
+                "none",
+                "none",
+                "none",
+                "none",
+                note_name,
+                "none",
+                "none",
+            ]
+        else:
+            instance = [
+                document_creation_time,
+                patient_id,
+                "none",
+                "none",
+                "none",
+                "none",
+                "none",
+                note_name,
+                "none",
+            ]
+        self.raw_events.append(instance)