Skip to content

Commit

Permalink
Merge pull request #3 from HealthNLPorg/v0.0.1
Browse files Browse the repository at this point in the history
V0.0.1
  • Loading branch information
etgld authored Jan 31, 2024
2 parents 8410952 + 29473fb commit 8935776
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 14 deletions.
8 changes: 1 addition & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,7 @@ mybroker/bin/artemis stop
## Input and output structure

Given the structure of the summarized gold timelines and the shared task data, the Docker assumes that the input in the `input`
folder will take the form of a collection of notes comprising all the patients of a given cancer type cohort (for the shared task one of melanoma or ovarian or breast cancers), the base filenames of which will correspond to the scheme:
```
<patient identifier>_<four digit year>_<two digit month>_<two digit date>
```
where the year, month, and date correspond to the creation time of the file. All the files in the shared task dataset follow this scheme, so for our data there is nothing you need to do.

Assuming successful processing, the output file will be a tab separated value (`tsv`) file in the `output` folder.
folder will take the form of a collection of notes comprising all the patients of a given cancer type cohort. Assuming successful processing, the output file will be a tab separated value (`tsv`) file in the `output` folder.
The file will have the columns:
```
DCT patient_id chemo_text chemo_annotation_id normed_timex timex_annotation_id tlink note_name tlink_inst
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ public void process( JCas jCas ) throws AnalysisEngineProcessException {
.anyMatch( tui -> this.tuiSet.contains( tui ) );

if ( !hasRelevantTUIs ){
LOGGER.info(fileName + " : no events with the provided TUIs " + this.tuis + "skipping to save time");
LOGGER.info(fileName + " : no events with the provided TUIs " + this.tuis + " skipping to save time");
return;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,7 @@ def process(self, cas: Cas):
if len(proc_mentions) > 0:
self._write_raw_timelines(cas, proc_mentions)
else:
self._add_empty_discovery(cas)
patient_id, note_name = pt_and_note(cas)
print(
f"No chemotherapy mentions ( using TUI: {CHEMO_TUI} ) found in patient {patient_id} note {note_name} - skipping"
Expand Down Expand Up @@ -424,6 +425,7 @@ def _write_raw_timelines(self, cas: Cas, proc_mentions: List[FeatureStructure]):
)
self._write_actual_proc_mentions(cas, actual_proc_mentions)
else:
self._add_empty_discovery(cas)
print(
f"No concrete chemotherapy mentions found in patient {patient_id} note {note_name} - skipping"
)
Expand All @@ -432,14 +434,18 @@ def _write_actual_proc_mentions(
self, cas: Cas, positive_chemo_mentions: List[FeatureStructure]
):
timex_type = cas.typesystem.get_type(ctakes_types.TimeMention)
event_type = cas.typesystem.get_type(ctakes_types.EventMention)
cas_source_data = cas.select(ctakes_types.Metadata)[0].sourceData
document_creation_time = cas_source_data.sourceOriginalDate
relevant_timexes = timexes_with_normalization(cas.select(timex_type))

base_tokens, token_map = tokens_and_map(cas, mode="dtr")
begin2token, end2token = invert_map(token_map)

def local_window_mentions(chemo):
return get_tlink_window_mentions(
chemo, relevant_timexes, begin2token, end2token, token_map
)

def dtr_result(chemo):
inst = get_dtr_instance(chemo, base_tokens, begin2token, end2token)
result = list(self.dtr_classifier(inst))[0]
Expand All @@ -455,9 +461,7 @@ def tlink_result(chemo, timex):
return label, inst

def tlink_result_dict(chemo):
window_mentions = get_tlink_window_mentions(
chemo, relevant_timexes, begin2token, end2token, token_map
)
window_mentions = local_window_mentions(chemo)
return {
window_mention: tlink_result(chemo, window_mention)
for window_mention in window_mentions
Expand All @@ -475,17 +479,32 @@ def tlink_result_dict(chemo):
)
)
}
if len(list(relevant_timexes)) == 0:

if (
len(list(relevant_timexes)) == 0
or len(
list(
chain.from_iterable(
map(local_window_mentions, positive_chemo_mentions)
)
)
)
== 0
):
print(
f"WARNING: No normalized timexes discovered in {patient_id} file {note_name}"
f"WARNING: Timexes suitable for TLINK pairing discovered in {patient_id} file {note_name}"
)
self._add_empty_discovery(cas)
return
for chemo in positive_chemo_mentions:
chemo_dtr, dtr_inst = "", "" # purely so pyright stops complaining
if self.use_dtr:
chemo_dtr, dtr_inst = dtr_result(chemo)
tlink_dict = tlink_result_dict(chemo)
for timex, tlink_inst_pair in tlink_dict.items():
tlink, tlink_inst = tlink_inst_pair
chemo_text = normalize_mention(chemo)
# if we made it this far these attributes are populated
timex_text = timex.time.normalizedForm
if self.use_dtr:
instance = [
Expand Down Expand Up @@ -514,3 +533,35 @@ def tlink_result_dict(chemo):
tlink_inst,
]
self.raw_events.append(instance)

def _add_empty_discovery(self, cas: Cas):
    """Record a placeholder row for a note that produced no timeline entries.

    The row keeps the document creation time, patient identifier, and note
    name read from ``cas`` and fills every other field with the literal
    string ``"none"``. It is appended to ``self.raw_events``.

    Args:
        cas: The CAS of the note being processed; its first ``Metadata``
            annotation supplies ``sourceData.sourceOriginalDate``.
    """
    source_data = cas.select(ctakes_types.Metadata)[0].sourceData
    creation_time = source_data.sourceOriginalDate
    patient_id, note_name = pt_and_note(cas)

    # The row is 11 fields wide when DTR is enabled and 9 otherwise; the
    # note name always sits between two runs of "none" placeholders.
    # NOTE(review): presumably these widths mirror the rows built in
    # _write_actual_proc_mentions - confirm against that method.
    if self.use_dtr:
        fillers_before_note, fillers_after_note = 6, 2
    else:
        fillers_before_note, fillers_after_note = 5, 1

    placeholder_row = (
        [creation_time, patient_id]
        + ["none"] * fillers_before_note
        + [note_name]
        + ["none"] * fillers_after_note
    )
    self.raw_events.append(placeholder_row)

0 comments on commit 8935776

Please sign in to comment.