Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use latest spacy 3 coreference module #142

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ Then to use coreference, run the following:
from summarizer import Summarizer
from summarizer.text_processors.coreference_handler import CoreferenceHandler

handler = CoreferenceHandler(greedyness=.4)
handler = CoreferenceHandler()
# How coreference works:
# >>>handler.process('''My sister has a dog. She loves him.''', min_length=2)
# ['My sister has a dog.', 'My sister loves a dog.']
Expand Down Expand Up @@ -255,7 +255,7 @@ arguments for custom and different models. This can be done through a command su

```
docker build -t summary-service -f Dockerfile.service ./
docker run --rm -it -p 5000:5000 summary-service:latest -model bert-large-uncased
docker run --rm -it -p 5000:8080 summary-service:latest -model bert-large-uncased
```

Other arguments can also be passed to the server. Below includes the list of available arguments.
Expand All @@ -271,6 +271,7 @@ This endpoint accepts a text/plain input which represents the text that you want
passed as request arguments. The accepted arguments are:

* ratio: Ratio of sentences to summarize to from the original body. (default to 0.2)
* num_sentences: Number of sentences to use. Overrides ratio if supplied
* min_length: The minimum length to accept as a sentence. (default to 25)
* max_length: The maximum length to accept as a sentence. (default to 500)

Expand Down
20 changes: 10 additions & 10 deletions requirements-service.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
numpy==1.17
torch==1.6.0
spacy==2.1.3
transformers==4.4.0
sentencepiece==0.1.95
Cython==0.29.10
tqdm==4.32.2
neuralcoref==4.0
numpy
torch
spacy
spacy-experimental
transformers
sentencepiece
Cython
tqdm
argparse
scikit-learn
bert-extractive-summarizer
Flask
flask-cors
nltk
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.1.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl
15 changes: 8 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
numpy
torch==1.9.0
spacy==2.1.3
transformers==4.9.1
sentencepiece==0.1.96
torch
spacy
spacy-experimental
transformers
sentencepiece
tqdm
neuralcoref==4.0
argparse
scikit-learn
pytest
sentence-transformers==2.1.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
sentence-transformers
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl
25 changes: 6 additions & 19 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,17 @@ def convert_to_paragraphs(self) -> str:
sentences: List[str] = self.run()
return ' '.join([sentence.strip() for sentence in sentences]).strip()


@app.route('/', methods=['GET'])
def hello_world():
    # Simple liveness/health-check endpoint for the service root.
    return 'Hello, World!'

@app.route('/summarize', methods=['POST'])
def convert_raw_text_by_ratio():
    """Summarize the raw text posted in the request body.

    Query-string arguments:
        ratio: Fraction of sentences to keep (default 0.2).
        num_sentences: Exact sentence count; overrides ``ratio`` when supplied.
        min_length: Minimum sentence length to accept (default 25).
        max_length: Maximum sentence length to accept (default 500).

    Returns a JSON object ``{"summary": <text>}``; aborts with 400 when the
    request body is empty.
    """
    ratio = float(request.args.get('ratio', 0.2))
    sentences = request.args.get('num_sentences')
    # num_sentences takes precedence over ratio inside the summarizer; keep
    # it None (not 0) when the argument is absent so ratio is used instead.
    num_sentences = int(sentences) if sentences else None
    min_length = int(request.args.get('min_length', 25))
    max_length = int(request.args.get('max_length', 500))

    data = request.data
    if not data:
        abort(make_response(jsonify(message="Request must have raw text"), 400))

    parsed = Parser(data).convert_to_paragraphs()
    summary = summarizer(parsed, ratio=ratio, num_sentences=num_sentences, min_length=min_length, max_length=max_length)

    return jsonify({
        'summary': summary
    })
Expand Down
49 changes: 35 additions & 14 deletions summarizer/text_processors/coreference_handler.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
# removed previous import and related functionality since it's just a blank language model,
# while neuralcoref requires passing pretrained language model via spacy.load()
# Updated to use latest Spacy 3 coreference model

from typing import List

import neuralcoref
import spacy

import re
from summarizer.text_processors.sentence_abc import SentenceABC
from summarizer.text_processors.sentence_handler import SentenceHandler

# Experimental coref model
DEFAULT_MODEL = "en_coreference_web_trf"


class CoreferenceHandler(SentenceABC):
    """Coreference handler backed by the experimental spacy 3 coref pipeline."""

    def __init__(
        self, spacy_model: str = DEFAULT_MODEL
    ):
        """
        Coreference handler. Updated to work with spacy > 3.0.

        :param spacy_model: The spacy model to use as default.
        """
        nlp = spacy.load(spacy_model)
        super().__init__(nlp)

    def process(self, body: str, min_length: int = 40, max_length: int = 600) -> List[str]:
        """
        Replace every coreferent mention in ``body`` with the head of its
        coreference chain and split the resolved text into sentences.

        :param body: The raw text to resolve and sentence-split.
        :param min_length: Minimum length that the sentences must fall above.
        :param max_length: Maximum length that the sentences must fall under.
        :return: Returns a list of resolved sentences.
        """
        doc = self.nlp(body)
        resolved_text = body
        offset = 0
        reindex = []
        # Record every non-first mention of each chain as
        # [start_char, end_char, chain head text, mention text].
        for chain in doc.spans:
            for idx, span in enumerate(doc.spans[chain]):
                if idx > 0:
                    reindex.append([span.start_char, span.end_char, doc.spans[chain][0].text, span.text])

        # Rewrite mentions left-to-right, tracking the cumulative length
        # drift so later character offsets stay valid.
        for span in sorted(reindex, key=lambda x: x[0]):
            antecedent = span[2]
            coreferent = span[3]
            antecedent_is_possessive = re.match(r".+['\u2019]s$", antecedent)
            coreferent_is_possessive = re.match(r"^([Ii]ts|[Hh]is|[Hh]er|[Tt]heir|.+['\u2019]s)$", coreferent)
            # Add possessive to antecedent if resolving in a possessive context
            if coreferent_is_possessive and not antecedent_is_possessive:
                antecedent = antecedent + "\u2019s"
            # Remove possessive from resolved antecedent if not resolving in a possessive context
            elif antecedent_is_possessive and not coreferent_is_possessive:
                antecedent = re.sub(r"['\u2019]s$", "", antecedent)
            resolved_text = resolved_text[0:span[0] + offset] + antecedent + resolved_text[span[1] + offset:]
            offset += len(antecedent) - (span[1] - span[0])

        # Bug fix: forward max_length as well; it was previously ignored.
        return SentenceHandler().process(resolved_text, min_length=min_length, max_length=max_length)

10 changes: 2 additions & 8 deletions summarizer/text_processors/sentence_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@
class SentenceABC:
"""Parent Class for sentence processing."""

def __init__(self, nlp: Language):
    """
    Base Sentence Handler with Spacy support.

    :param nlp: NLP Pipeline.
    """
    self.nlp = nlp

def sentence_processor(
self, doc, min_length: int = 40, max_length: int = 600
Expand All @@ -31,11 +29,7 @@ def sentence_processor(

for c in doc.sents:
if max_length > len(c.text.strip()) > min_length:

if self.is_spacy_3:
to_return.append(c.text.strip())
else:
to_return.append(c.string.strip())
to_return.append(c.text.strip())

return to_return

Expand Down
15 changes: 3 additions & 12 deletions summarizer/text_processors/sentence_handler.py
Original file line number Diff line number Diff line change
def __init__(self, language: Language = English):
    """
    Sentence handler that splits raw text into sentences with spacy.

    :param language: Determines the language to use with spacy.
    """
    # Only sentence segmentation is needed here, so the heavier pipeline
    # components are disabled for speed.
    nlp = language(disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"])
    nlp.add_pipe("sentencizer")
    super().__init__(nlp)

def process(
self, body: str, min_length: int = 40, max_length: int = 600
Expand Down
64 changes: 61 additions & 3 deletions tests/test_coreference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,65 @@ def coreference_handler():


def test_coreference_handler(coreference_handler):
    """Pronouns resolve to their antecedents in a simple two-sentence text."""
    orig = "My sister has a dog. She loves him."
    resolved = "My sister has a dog. My sister loves a dog."
    result = coreference_handler.process(orig, min_length=2)
    assert " ".join(result) == resolved


def test_longer_coreference_handler(coreference_handler):
    """Resolves pronouns across several chains in a longer passage."""
    source_text = "My sister has a dog. My sister loves him. John Smith called from London, he said it's raining in the city."
    expected = (
        "My sister has a dog. My sister loves a dog. "
        "John Smith called from London, John Smith said it's raining in London."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler(coreference_handler):
    """A possessive pronoun resolves to the antecedent plus an apostrophe-s."""
    source_text = (
        "The government announced the new policy on Wednesday. "
        "Their spokesperson, Angela Smith, said that they had listened to all proposals."
    )
    expected = (
        "The government announced the new policy on Wednesday. "
        "The government\u2019s spokesperson, Angela Smith, said that The government had "
        "listened to all proposals."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler_2(coreference_handler):
    """The possessive 'its' becomes the antecedent with an apostrophe-s."""
    source_text = "Next, the cat sat on the mat. We tickled its nose."
    expected = "Next, the cat sat on the mat. We tickled the cat\u2019s nose."
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler_3(coreference_handler):
    """An already-possessive antecedent keeps its original apostrophe style."""
    source_text = (
        "The government's handling of the announcement was poor. "
        "Their spokesperson, Angela Smith, said that they had listened to all proposals."
    )
    expected = (
        "The government's handling of the announcement was poor. "
        "The government's spokesperson, Angela Smith, said that The government had "
        "listened to all proposals."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler_4(coreference_handler):
    """A possessive mention keeps the curly apostrophe in the resolved text."""
    source_text = "Anna wrote to Tom. Tom is Anna's brother. He is living in New York."
    expected = "Anna wrote to Tom. Tom is Anna\u2019s brother. Tom is living in New York."
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_overlapping_coreference_handler(coreference_handler):
    """Overlapping chains in a long passage resolve without corrupting offsets."""
    source_text = (
        "Fidel Castro led a communist revolution that toppled the Cuban government in 1959, "
        "after which he declared himself prime minister. He held the title until 1976, when it was "
        "abolished and he became head of the Communist Party and president of the council of state and "
        "the council of ministers. With his health failing, Castro handed power to his brother, "
        "Raúl, in 2006. He died in 2016."
    )
    expected = (
        "Fidel Castro led a communist revolution that toppled the Cuban government in 1959, "
        "after which Fidel Castro declared Fidel Castro prime minister. Fidel Castro held himself prime "
        "minister until 1976, when himself prime minister was abolished and Fidel Castro became head of the "
        "Communist Party and president of the council of state and the council of ministers. "
        "With Fidel Castro\u2019s health failing, Fidel Castro handed power to Fidel Castro\u2019s brother, "
        "Raúl, in 2006. Fidel Castro died in 2016."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected