Merge remote-tracking branch 'origin/master'
TimSchopf committed May 2, 2024
2 parents 3dc3212 + 0ca2367 commit 5e28116
Showing 9 changed files with 120 additions and 21 deletions.
3 changes: 1 addition & 2 deletions .readthedocs.yaml
@@ -9,7 +9,7 @@ version: 2
sphinx:
configuration: docs/conf.py
builder: html
fail_on_warning: true
fail_on_warning: false

# Optionally build your docs in additional formats such as PDF
formats: all
@@ -18,7 +18,6 @@ formats: all
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
- method: pip
path: .
extra_requirements:
21 changes: 11 additions & 10 deletions README.md
@@ -255,7 +255,7 @@ vectorizer = KeyphraseTfidfVectorizer()

# Print parameters
print(vectorizer.get_params())
>> > {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <
>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <


class 'numpy.int64'>, 'lowercase': True, 'max_df': None
@@ -434,7 +434,7 @@ vectorizer.fit(docs)
keyphrases = vectorizer.get_feature_names_out()
print(keyphrases)

>>>['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
>>> ['output value' 'information retrieval' 'algorithm' 'vector' 'groups'
'main topics' 'task' 'precise summary' 'supervised learning'
'inductive bias' 'information retrieval environment'
'supervised learning algorithm' 'function' 'input' 'pair'
@@ -735,12 +735,12 @@ vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)

# initial vectorizer fit
vectorizer.fit_transform([docs[0]]).toarray()
>> > array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
>>> array([[1, 1, 3, 1, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 3, 1, 3,
1, 1, 1]])

# check learned keyphrases
print(vectorizer.get_feature_names_out())
>> > ['output pairs', 'output value', 'function', 'optimal scenario',
>>> ['output pairs', 'output value', 'function', 'optimal scenario',
'pair', 'supervised learning', 'supervisory signal', 'algorithm',
'supervised learning algorithm', 'way', 'training examples',
'input object', 'example', 'machine', 'output',
@@ -751,12 +751,12 @@ print(vectorizer.get_feature_names_out())
# learn additional keyphrases from new documents with partial fit
vectorizer.partial_fit([docs[1]])
vectorizer.transform([docs[1]]).toarray()
>> > array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
>>> array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1]])

# check learned keyphrases, including newly learned ones
print(vectorizer.get_feature_names_out())
>> > ['output pairs', 'output value', 'function', 'optimal scenario',
>>> ['output pairs', 'output value', 'function', 'optimal scenario',
'pair', 'supervised learning', 'supervisory signal', 'algorithm',
'supervised learning algorithm', 'way', 'training examples',
'input object', 'example', 'machine', 'output',
@@ -771,16 +771,16 @@ print(vectorizer.get_feature_names_out())
# update list of learned keyphrases according to 'delete_min_df'
vectorizer.update_bow([docs[1]])
vectorizer.transform([docs[1]]).toarray()
>> > array([[5, 5]])
>>> array([[5, 5]])

# check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
print(vectorizer.get_feature_names_out())
>> > ['keywords', 'document']
>>> ['keywords', 'document']

# update again and check the impact of 'decay' on the learned document-keyphrase matrix
vectorizer.update_bow([docs[1]])
vectorizer.X_.toarray()
>> > array([[7.5, 7.5]])
>>> array([[7.5, 7.5]])
```
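The values in the last call are consistent with `decay=0.5` scaling the previously accumulated counts before the counts of the newly passed document are added. A minimal sketch of that arithmetic, assuming this reading of `decay` (inferred from the numbers above, not from the implementation):

```python
# Reproduce the 7.5 values shown above under the assumed decay behaviour:
# the previously learned counts are multiplied by `decay` before the counts
# of the newly passed document are added on top.
decay = 0.5
previous_count = 5   # count of 'keywords' (and 'document') after the first update_bow call
new_count = 5        # count of 'keywords' (and 'document') in docs[1]

updated_count = decay * previous_count + new_count
print(updated_count)  # 7.5, matching vectorizer.X_.toarray()
```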

<a name="#citation-information"/></a>
@@ -790,7 +790,8 @@ vectorizer.X_.toarray()
[Back to Table of Contents](#toc)

When citing KeyphraseVectorizers or PatternRank in academic papers and theses, please use this BibTeX entry:
```

```plaintext
@conference{schopf_etal_kdir22,
author={Tim Schopf and Simon Klimek and Florian Matthes},
title={PatternRank: Leveraging Pretrained Language Models and Part of Speech for Unsupervised Keyphrase Extraction},
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -14,6 +14,7 @@ docutils>=0.16
numpy>=1.18.5
spacy>=3.0.1
spacy-transformers>=1.1.6
spacy-curated-transformers>=0.2.2
nltk>=3.6.1
scikit-learn>=1.0
scipy>=1.7.3
16 changes: 10 additions & 6 deletions keyphrase_vectorizers/keyphrase_count_vectorizer.py
@@ -39,7 +39,7 @@ class KeyphraseCountVectorizer(_KeyphraseVectorizerMixin, BaseEstimator):
must be customized accordingly.
Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
Parameters
----------
@@ -458,11 +458,15 @@ def update_bow(self, raw_documents: List[str]) -> csr_matrix:
that do not exceed `self.delete_min_df` are removed from its
vocabulary and bag-of-keywords matrix.
Arguments:
raw_documents: A list of documents
Parameters
----------
raw_documents : iterable
An iterable of strings.
Returns:
X_: Bag-of-keywords matrix
Returns
-------
X_ : scipy.sparse.csr_matrix
Bag-of-keywords matrix
"""

if hasattr(self, "X_"):
@@ -501,4 +505,4 @@ def _clean_bow(self) -> None:
x = np.array(self.keyphrases)
mask = np.full(len(self.keyphrases), True, dtype=bool)
mask[indices] = False
self.keyphrases = list(x[~mask])
self.keyphrases = list(x[~mask])
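The updated docstring above points out that non-English documents need a spaCy pipeline, POS pattern, and stopword list that match the language, and possibly different `spacy_exclude` components. A minimal sketch of such a configuration for German documents, with illustrative parameter values that are not part of this commit:

```python
from keyphrase_vectorizers import KeyphraseCountVectorizer

# Illustrative German setup (assumes the pipeline was installed beforehand,
# e.g. via `python -m spacy download de_core_news_sm`).
vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline='de_core_news_sm',  # pipeline matching the document language
    pos_pattern='<ADJ.*>*<N.*>+',      # German POS tags differ from the English default
    stop_words='german',               # stopword list matching the language
)
# Depending on the chosen pipeline, `spacy_exclude` may also need adjusting
# so that the POS tagger remains among the loaded components.

docs = ['Angewandte Forschung wird an Hochschulen und Forschungsinstituten betrieben.']
vectorizer.fit(docs)
print(vectorizer.get_feature_names_out())
```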
2 changes: 1 addition & 1 deletion keyphrase_vectorizers/keyphrase_tfidf_vectorizer.py
@@ -37,7 +37,7 @@ class KeyphraseTfidfVectorizer(KeyphraseCountVectorizer):
must be customized accordingly.
Additionally, the ``pos_pattern`` parameter has to be customized as the `spaCy part-of-speech tags`_ differ between languages.
Without customizing, the words will be tagged with wrong part-of-speech tags and no stopwords will be considered.
In addition, you have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
In addition, you may have to exclude/include different pipeline components using the ``spacy_exclude`` parameter for the spaCy POS tagger to work properly.
Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.
This is a common term weighting scheme in information retrieval,
2 changes: 1 addition & 1 deletion keyphrase_vectorizers/keyphrase_vectorizer_mixin.py
@@ -428,7 +428,7 @@ def _get_pos_keyphrases(self, document_list: List[str], stop_words: Union[str, L
else:
pos_tuples = custom_pos_tagger(raw_documents=document_list)

# get the original documents after they were processed by spaCy
# get the original documents after they were processed by a tokenizer and a POS tagger
processed_docs = []
for tup in pos_tuples:
token = tup[0]
5 changes: 4 additions & 1 deletion tests/requirements.txt
@@ -1,4 +1,7 @@
pytest>=7.0.1
keybert>=0.5.0
flair==0.11.3
scipy==1.7.3
scipy==1.7.3
bertopic>=0.16.1
scikit-learn>=1.0.1
umap-learn==0.5.4
46 changes: 46 additions & 0 deletions tests/test_vectorizers.py
@@ -2,9 +2,11 @@

import flair
import spacy
from bertopic import BERTopic
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from keybert import KeyBERT
from sklearn.datasets import fetch_20newsgroups

import tests.utils as utils
from keyphrase_vectorizers import KeyphraseCountVectorizer, KeyphraseTfidfVectorizer
@@ -132,3 +134,47 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTag
keyphrases = vectorizer.get_feature_names_out()

assert sorted(keyphrases) == sorted_english_test_keyphrases


def test_online_vectorizer():
first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix()
second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix()
first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases()
english_keyphrases = utils.get_english_test_keyphrases()
frequencies_after_min_df = utils.get_frequencies_after_min_df()
frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df()
frequencies_after_bow = utils.get_frequencies_after_bow()

    # initial vectorizer fit
vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)

assert [sorted(count_list) for count_list in
vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix
assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases

# learn additional keyphrases from new documents with partial fit
vectorizer.partial_fit([english_docs[1]])

assert [sorted(count_list) for count_list in
vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix
assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases

# update list of learned keyphrases according to 'delete_min_df'
vectorizer.update_bow([english_docs[1]])
assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all()

# check updated list of learned keyphrases (only the ones that appear more than 'delete_min_df' remain)
assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df

# update again and check the impact of 'decay' on the learned document-keyphrase matrix
vectorizer.update_bow([english_docs[1]])
assert (vectorizer.X_.toarray() == frequencies_after_bow).all()


def test_bertopic():
data = fetch_20newsgroups(subset='train')
texts = data.data[:100]
topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
topics, probs = topic_model.fit_transform(documents=texts)
new_topics = topic_model.reduce_outliers(texts, topics)
topic_model.update_topics(texts, topics=new_topics)
45 changes: 45 additions & 0 deletions tests/utils.py
@@ -1,3 +1,4 @@
import numpy as np
def get_english_test_docs():
english_docs = ["""Supervised learning is the machine learning task of learning a function that
maps an input to an output based on example input-output pairs. It infers a
@@ -56,6 +57,17 @@ def get_english_test_keyphrases():
return sorted_english_test_keyphrases


def get_english_first_doc_test_keyphrases():
sorted_english_first_doc_test_keyphrases = ['algorithm', 'class labels', 'example', 'function', 'inductive bias',
'input', 'input object', 'machine', 'new examples', 'optimal scenario',
'output', 'output pairs', 'output value', 'pair', 'set',
'supervised learning', 'supervised learning algorithm',
'supervisory signal', 'task', 'training data', 'training examples',
'unseen instances', 'unseen situations', 'vector', 'way']

return sorted_english_first_doc_test_keyphrases


def get_sorted_english_keyphrases_custom_flair_tagger():
sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content',
'document relevance',
@@ -102,6 +114,21 @@ def get_sorted_english_count_matrix():
return sorted_english_count_matrix


def get_sorted_english_first_doc_count_matrix():
sorted_english_first_doc_count_matrix = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]]

return sorted_english_first_doc_count_matrix


def get_sorted_english_second_doc_count_matrix():
sorted_english_second_doc_count_matrix = [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 2, 5, 5]]

return sorted_english_second_doc_count_matrix


def get_sorted_french_count_matrix():
sorted_french_count_matrix = [[1, 1, 1, 1]]

@@ -130,3 +157,21 @@ def get_english_keybert_keyphrases():
'document content']]

return english_keybert_keyphrases


def get_frequencies_after_min_df():
frequency_array = np.array([[5, 5]])

return frequency_array


def get_frequencies_after_bow():
frequency_array = np.array([[7.5, 7.5]])

return frequency_array


def get_frequent_keyphrases_after_min_df():
keyphrases = ['document', 'keywords']

return keyphrases
