Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use latest spacy 3 coreference module #142

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ Then to use coreference, run the following:
from summarizer import Summarizer
from summarizer.text_processors.coreference_handler import CoreferenceHandler

handler = CoreferenceHandler(greedyness=.4)
handler = CoreferenceHandler()
# How coreference works:
# >>>handler.process('''My sister has a dog. She loves him.''', min_length=2)
# ['My sister has a dog.', 'My sister loves a dog.']
Expand Down Expand Up @@ -255,7 +255,7 @@ arguments for custom and different models. This can be done through a command su

```
docker build -t summary-service -f Dockerfile.service ./
docker run --rm -it -p 5000:5000 summary-service:latest -model bert-large-uncased
docker run --rm -it -p 5000:8080 summary-service:latest -model bert-large-uncased
```

Other arguments can also be passed to the server. Below includes the list of available arguments.
Expand All @@ -271,6 +271,7 @@ This endpoint accepts a text/plain input which represents the text that you want
passed as request arguments. The accepted arguments are:

* ratio: Ratio of sentences to summarize to from the original body. (default to 0.2)
* num_sentences: Number of sentences to use. Overrides ratio if supplied
* min_length: The minimum length to accept as a sentence. (default to 25)
* max_length: The maximum length to accept as a sentence. (default to 500)

Expand Down
20 changes: 10 additions & 10 deletions requirements-service.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
numpy==1.17
torch==1.6.0
spacy==2.1.3
transformers==4.4.0
sentencepiece==0.1.95
Cython==0.29.10
tqdm==4.32.2
neuralcoref==4.0
numpy
torch
spacy
spacy-experimental
transformers
sentencepiece
Cython
tqdm
argparse
scikit-learn
bert-extractive-summarizer
Flask
flask-cors
nltk
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.0/en_core_web_sm-2.1.0.tar.gz
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl
15 changes: 8 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
numpy
torch==1.9.0
spacy==2.1.3
transformers==4.9.1
sentencepiece==0.1.96
torch
spacy
spacy-experimental
transformers
sentencepiece
tqdm
neuralcoref==4.0
argparse
scikit-learn
pytest
sentence-transformers==2.1.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz
sentence-transformers
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl
https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl
25 changes: 6 additions & 19 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,17 @@ def convert_to_paragraphs(self) -> str:
sentences: List[str] = self.run()
return ' '.join([sentence.strip() for sentence in sentences]).strip()


@app.route('/', methods=['GET'])
def hello_world():
    # Simple liveness/health-check endpoint for the service root.
    return 'Hello, World!'

@app.route('/summarize', methods=['POST'])
def convert_raw_text_by_ratio():
    """Summarize the raw text posted in the request body.

    Query-string arguments:
        ratio: Fraction of sentences to keep (default 0.2).
        num_sentences: Exact sentence count; overrides ``ratio`` when supplied.
        min_length: Minimum sentence length to accept (default 25).
        max_length: Maximum sentence length to accept (default 500).

    Returns a JSON object ``{"summary": <text>}``; aborts with 400 when the
    request body is empty.
    """
    ratio = float(request.args.get('ratio', 0.2))
    sentences = request.args.get('num_sentences')
    # num_sentences takes precedence over ratio inside the summarizer; keep
    # it None (not 0) when the argument is absent so ratio is used instead.
    num_sentences = int(sentences) if sentences else None
    min_length = int(request.args.get('min_length', 25))
    max_length = int(request.args.get('max_length', 500))

    data = request.data
    if not data:
        abort(make_response(jsonify(message="Request must have raw text"), 400))

    parsed = Parser(data).convert_to_paragraphs()
    summary = summarizer(parsed, ratio=ratio, num_sentences=num_sentences, min_length=min_length, max_length=max_length)

    return jsonify({
        'summary': summary
    })
Expand Down
49 changes: 35 additions & 14 deletions summarizer/text_processors/coreference_handler.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
# removed previous import and related functionality since it's just a blank language model,
# while neuralcoref requires passing pretrained language model via spacy.load()
# Updated to use latest Spacy 3 coreference model

from typing import List

import neuralcoref
import spacy

import re
from summarizer.text_processors.sentence_abc import SentenceABC
from summarizer.text_processors.sentence_handler import SentenceHandler

# Experimental coref model
DEFAULT_MODEL = "en_coreference_web_trf"


class CoreferenceHandler(SentenceABC):
    """Coreference handler backed by the experimental spacy 3 coref pipeline."""

    def __init__(
        self, spacy_model: str = DEFAULT_MODEL
    ):
        """
        Coreference handler. Updated to work with spacy > 3.0.

        :param spacy_model: The spacy model to use as default.
        """
        nlp = spacy.load(spacy_model)
        super().__init__(nlp)

    def process(self, body: str, min_length: int = 40, max_length: int = 600) -> List[str]:
        """
        Replace every coreferent mention in ``body`` with the head of its
        coreference chain and split the resolved text into sentences.

        :param body: The raw text to resolve and sentence-split.
        :param min_length: Minimum length that the sentences must fall above.
        :param max_length: Maximum length that the sentences must fall under.
        :return: Returns a list of resolved sentences.
        """
        doc = self.nlp(body)
        resolved_text = body
        offset = 0
        reindex = []
        # Record every non-first mention of each chain as
        # [start_char, end_char, chain head text, mention text].
        for chain in doc.spans:
            for idx, span in enumerate(doc.spans[chain]):
                if idx > 0:
                    reindex.append([span.start_char, span.end_char, doc.spans[chain][0].text, span.text])

        # Rewrite mentions left-to-right, tracking the cumulative length
        # drift so later character offsets stay valid.
        for span in sorted(reindex, key=lambda x: x[0]):
            antecedent = span[2]
            coreferent = span[3]
            antecedent_is_possessive = re.match(r".+['\u2019]s$", antecedent)
            coreferent_is_possessive = re.match(r"^([Ii]ts|[Hh]is|[Hh]er|[Tt]heir|.+['\u2019]s)$", coreferent)
            # Add possessive to antecedent if resolving in a possessive context
            if coreferent_is_possessive and not antecedent_is_possessive:
                antecedent = antecedent + "\u2019s"
            # Remove possessive from resolved antecedent if not resolving in a possessive context
            elif antecedent_is_possessive and not coreferent_is_possessive:
                antecedent = re.sub(r"['\u2019]s$", "", antecedent)
            resolved_text = resolved_text[0:span[0] + offset] + antecedent + resolved_text[span[1] + offset:]
            offset += len(antecedent) - (span[1] - span[0])

        # Bug fix: forward max_length as well; it was previously ignored.
        return SentenceHandler().process(resolved_text, min_length=min_length, max_length=max_length)

10 changes: 2 additions & 8 deletions summarizer/text_processors/sentence_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@
class SentenceABC:
"""Parent Class for sentence processing."""

def __init__(self, nlp: Language):
    """
    Base Sentence Handler with Spacy support.

    :param nlp: NLP Pipeline.
    """
    self.nlp = nlp

def sentence_processor(
self, doc, min_length: int = 40, max_length: int = 600
Expand All @@ -31,11 +29,7 @@ def sentence_processor(

for c in doc.sents:
if max_length > len(c.text.strip()) > min_length:

if self.is_spacy_3:
to_return.append(c.text.strip())
else:
to_return.append(c.string.strip())
to_return.append(c.text.strip())

return to_return

Expand Down
15 changes: 3 additions & 12 deletions summarizer/text_processors/sentence_handler.py
Original file line number Diff line number Diff line change
def __init__(self, language: Language = English):
    """
    Sentence handler that splits raw text into sentences with spacy.

    :param language: Determines the language to use with spacy.
    """
    # Only sentence segmentation is needed here, so the heavier pipeline
    # components are disabled for speed.
    nlp = language(disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"])
    nlp.add_pipe("sentencizer")
    super().__init__(nlp)

def process(
self, body: str, min_length: int = 40, max_length: int = 600
Expand Down
64 changes: 61 additions & 3 deletions tests/test_coreference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,65 @@ def coreference_handler():


def test_coreference_handler(coreference_handler):
    """Pronouns resolve to their antecedents in a simple two-sentence text."""
    orig = "My sister has a dog. She loves him."
    resolved = "My sister has a dog. My sister loves a dog."
    result = coreference_handler.process(orig, min_length=2)
    assert " ".join(result) == resolved


def test_longer_coreference_handler(coreference_handler):
    """Resolves pronouns across several chains in a longer passage."""
    source_text = "My sister has a dog. My sister loves him. John Smith called from London, he said it's raining in the city."
    expected = (
        "My sister has a dog. My sister loves a dog. "
        "John Smith called from London, John Smith said it's raining in London."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler(coreference_handler):
    """A possessive pronoun resolves to the antecedent plus an apostrophe-s."""
    source_text = (
        "The government announced the new policy on Wednesday. "
        "Their spokesperson, Angela Smith, said that they had listened to all proposals."
    )
    expected = (
        "The government announced the new policy on Wednesday. "
        "The government\u2019s spokesperson, Angela Smith, said that The government had "
        "listened to all proposals."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler_2(coreference_handler):
    """The possessive 'its' becomes the antecedent with an apostrophe-s."""
    source_text = "Next, the cat sat on the mat. We tickled its nose."
    expected = "Next, the cat sat on the mat. We tickled the cat\u2019s nose."
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler_3(coreference_handler):
    """An already-possessive antecedent keeps its original apostrophe style."""
    source_text = (
        "The government's handling of the announcement was poor. "
        "Their spokesperson, Angela Smith, said that they had listened to all proposals."
    )
    expected = (
        "The government's handling of the announcement was poor. "
        "The government's spokesperson, Angela Smith, said that The government had "
        "listened to all proposals."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_possessive_coreference_handler_4(coreference_handler):
    """A possessive mention keeps the curly apostrophe in the resolved text."""
    source_text = "Anna wrote to Tom. Tom is Anna's brother. He is living in New York."
    expected = "Anna wrote to Tom. Tom is Anna\u2019s brother. Tom is living in New York."
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected


def test_overlapping_coreference_handler(coreference_handler):
    """Overlapping chains in a long passage resolve without corrupting offsets."""
    source_text = (
        "Fidel Castro led a communist revolution that toppled the Cuban government in 1959, "
        "after which he declared himself prime minister. He held the title until 1976, when it was "
        "abolished and he became head of the Communist Party and president of the council of state and "
        "the council of ministers. With his health failing, Castro handed power to his brother, "
        "Raúl, in 2006. He died in 2016."
    )
    expected = (
        "Fidel Castro led a communist revolution that toppled the Cuban government in 1959, "
        "after which Fidel Castro declared Fidel Castro prime minister. Fidel Castro held himself prime "
        "minister until 1976, when himself prime minister was abolished and Fidel Castro became head of the "
        "Communist Party and president of the council of state and the council of ministers. "
        "With Fidel Castro\u2019s health failing, Fidel Castro handed power to Fidel Castro\u2019s brother, "
        "Raúl, in 2006. Fidel Castro died in 2016."
    )
    sentences = coreference_handler.process(source_text, min_length=2)
    assert " ".join(sentences) == expected