Commit eede5fe: add tests

Signed-off-by: Tim Schopf <tim.schopf@t-online.de>
TimSchopf committed Apr 29, 2024
1 parent e01123a commit eede5fe
Showing 4 changed files with 115 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -26,7 +26,7 @@ python:
build:
  os: ubuntu-22.04
  tools:
-    python: "3.8"
+    python: "3.7"

submodules:
  include: all
4 changes: 3 additions & 1 deletion tests/requirements.txt
@@ -1,4 +1,6 @@
pytest>=7.0.1
keybert>=0.5.0
flair==0.11.3
-scipy==1.7.3
+scipy==1.7.3
+bertopic>=0.16.1
+datasets==2.13.2
47 changes: 47 additions & 0 deletions tests/test_vectorizers.py
@@ -2,6 +2,8 @@

import flair
import spacy
from bertopic import BERTopic
from datasets import load_dataset
from flair.models import SequenceTagger
from flair.tokenization import SegtokSentenceSplitter
from keybert import KeyBERT
@@ -132,3 +134,48 @@ def custom_pos_tagger(raw_documents: List[str], tagger: flair.models.SequenceTagger
keyphrases = vectorizer.get_feature_names_out()

assert sorted(keyphrases) == sorted_english_test_keyphrases


def test_online_vectorizer():
first_doc_count_matrix = utils.get_sorted_english_first_doc_count_matrix()
second_doc_count_matrix = utils.get_sorted_english_second_doc_count_matrix()
first_doc_test_keyphrases = utils.get_english_first_doc_test_keyphrases()
english_keyphrases = utils.get_english_test_keyphrases()
frequencies_after_min_df = utils.get_frequencies_after_min_df()
frequent_keyphrases_after_min_df = utils.get_frequent_keyphrases_after_min_df()
frequencies_after_bow = utils.get_frequencies_after_bow()

    # initial vectorizer fit
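    # decay=0.5 down-weights previously learned counts at each BOW update;
    # delete_min_df=3 drops keyphrases that no longer occur more often than the threshold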
vectorizer = KeyphraseCountVectorizer(decay=0.5, delete_min_df=3)

assert [sorted(count_list) for count_list in
vectorizer.fit_transform([english_docs[0]]).toarray()] == first_doc_count_matrix
assert sorted(vectorizer.get_feature_names_out()) == first_doc_test_keyphrases

# learn additional keyphrases from new documents with partial fit
vectorizer.partial_fit([english_docs[1]])

assert [sorted(count_list) for count_list in
vectorizer.transform([english_docs[1]]).toarray()] == second_doc_count_matrix
assert sorted(vectorizer.get_feature_names_out()) == english_keyphrases

    # update the list of learned keyphrases according to 'delete_min_df'
vectorizer.update_bow([english_docs[1]])
assert (vectorizer.transform([english_docs[1]]).toarray() == frequencies_after_min_df).all()

    # check updated list of learned keyphrases (only those that appear more often than 'delete_min_df' remain)
assert sorted(vectorizer.get_feature_names_out()) == frequent_keyphrases_after_min_df

# update again and check the impact of 'decay' on the learned document-keyphrase matrix
vectorizer.update_bow([english_docs[1]])
assert (vectorizer.X_.toarray() == frequencies_after_bow).all()


def test_bertopic():
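    # load a small slice of the ag_news dataset to keep the test fast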
data = load_dataset("ag_news")
texts = data['train']['text']
texts = texts[:100]
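    # plug KeyphraseCountVectorizer into BERTopic as its vectorizer_model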
topic_model = BERTopic(vectorizer_model=KeyphraseCountVectorizer())
topics, probs = topic_model.fit_transform(documents=texts)
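    # reassign outlier documents to topics, then refresh the topic representations with the new assignments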
new_topics = topic_model.reduce_outliers(texts, topics)
topic_model.update_topics(texts, topics=new_topics)
64 changes: 64 additions & 0 deletions tests/utils.py
@@ -1,3 +1,4 @@
import numpy as np
def get_english_test_docs():
english_docs = ["""Supervised learning is the machine learning task of learning a function that
maps an input to an output based on example input-output pairs. It infers a
@@ -56,6 +57,36 @@ def get_english_test_keyphrases():
return sorted_english_test_keyphrases


def get_english_first_doc_test_keyphrases():
sorted_english_first_doc_test_keyphrases = ['algorithm',
'class labels',
'example',
'function',
'inductive bias',
'input',
'input object',
'machine',
'new examples',
'optimal scenario',
'output',
'output pairs',
'output value',
'pair',
'set',
'supervised learning',
'supervised learning algorithm',
'supervisory signal',
'task',
'training data',
'training examples',
'unseen instances',
'unseen situations',
'vector',
'way']

return sorted_english_first_doc_test_keyphrases


def get_sorted_english_keyphrases_custom_flair_tagger():
sorted_english_custom_tagger_keyphrases = ['algorithm', 'class labels', 'document', 'document content',
'document relevance',
@@ -102,6 +133,21 @@ def get_sorted_english_count_matrix():
return sorted_english_count_matrix


def get_sorted_english_first_doc_count_matrix():
sorted_english_first_doc_count_matrix = [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 3, 3]]

return sorted_english_first_doc_count_matrix


def get_sorted_english_second_doc_count_matrix():
sorted_english_second_doc_count_matrix = [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 2, 5, 5]]

return sorted_english_second_doc_count_matrix


def get_sorted_french_count_matrix():
sorted_french_count_matrix = [[1, 1, 1, 1]]

@@ -130,3 +176,21 @@ def get_english_keybert_keyphrases():
'document content']]

return english_keybert_keyphrases


def get_frequencies_after_min_df():
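    # expected document-keyphrase matrix once update_bow has pruned entries via delete_min_df=3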
frequency_array = np.array([[5, 5]])

return frequency_array


def get_frequencies_after_bow():
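    # expected matrix after a second update_bow call with decay=0.5:
    # previous counts of 5 are halved and the 5 new occurrences are added (0.5 * 5 + 5 = 7.5)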
frequency_array = np.array([[7.5, 7.5]])

return frequency_array


def get_frequent_keyphrases_after_min_df():
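    # keyphrases that still occur more often than delete_min_df=3 after the BOW update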
keyphrases = ['document', 'keywords']

return keyphrases
