Skip to content

Commit

Permalink
Merge pull request #8 from kissarat/master
Browse files Browse the repository at this point in the history
Using the cache is optional

Thanks @kissarat
  • Loading branch information
Alir3z4 committed Feb 23, 2015
2 parents dc683c4 + 9aa9605 commit 2a41dbb
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 14 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ src/
.c9/
bin/
develop-eggs/
eggs/
eggs/
2 changes: 1 addition & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[submodule "stop_words/stop-words"]
path = stop_words/stop-words
url = https://github.com/Alir3z4/stop-words.git
url = git@github.com:Alir3z4/stop-words.git
8 changes: 8 additions & 0 deletions ChangeLog.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
2015.2.23
=========
----

* Feature: Using the cache is optional
* Feature: Filtering stopwords

2015.2.21
=========
----
Expand Down Expand Up @@ -39,3 +46,4 @@
* Initial release.
* Package on pypi.
* github.com/Alir3z4/stop-words as submodule.
2 changes: 0 additions & 2 deletions bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,6 @@
setup_args['version'] = options.setuptools_version

ez['use_setuptools'](**setup_args)
import setuptools
import pkg_resources

# This does not (always?) update the default working set. We will
# do it.
Expand Down
57 changes: 52 additions & 5 deletions stop_words/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import os

__VERSION__ = (2015, 2, 21)
__VERSION__ = (2015, 2, 23)
CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
STOP_WORDS_DIR = os.path.join(CURRENT_DIR, 'stop-words')
STOP_WORDS_CACHE = {}
Expand All @@ -11,7 +11,7 @@
buffer = buffer.decode('ascii')
LANGUAGE_MAPPING = json.loads(buffer)

AVAILABLE_LANGUAGES = LANGUAGE_MAPPING.values()
AVAILABLE_LANGUAGES = list(LANGUAGE_MAPPING.values())


def get_version():
Expand All @@ -25,7 +25,7 @@ class StopWordError(Exception):
pass


def get_stop_words(language):
def get_stop_words(language, cache=True):
"""
:type language: basestring
Expand All @@ -39,25 +39,72 @@ def get_stop_words(language):
language
))

if language in STOP_WORDS_CACHE:
if cache and language in STOP_WORDS_CACHE:
return STOP_WORDS_CACHE[language]

language_filename = os.path.join(STOP_WORDS_DIR, language + '.txt')
try:
with open(language_filename, 'rb') as language_file:
stop_words = [line.decode('utf-8').strip()
for line in language_file.readlines()]
stop_words = apply_filters(stop_words, language)
except IOError:
raise StopWordError(
'{0}" file is unreadable, check your installation.'.format(
language_filename
)
)

STOP_WORDS_CACHE[language] = stop_words
if cache:
STOP_WORDS_CACHE[language] = stop_words

return stop_words

# Registry of stop-word filters. Keys are language names; the special key
# ``None`` holds the filters that apply to every language and must always
# be present because ``apply_filters`` indexes it directly.
_filters = {None: []}


def apply_filters(stopwords, language):
    """
    Apply registered filters to stopwords.

    Filters registered for ``language`` run first and are called as
    ``func(stopwords)``. Filters registered for all languages (under the
    ``None`` key) run afterwards and are called as
    ``func(stopwords, language)``.

    :param stopwords: list
    :param language: string
    :return: filtered stopwords
    """
    # ``None`` is the key of the global filters. Without this guard a call
    # with ``language=None`` would run the global filters twice, the first
    # time with the one-argument, language-specific calling convention.
    if language is not None:
        for func in _filters.get(language, ()):
            stopwords = func(stopwords)

    for func in _filters[None]:
        stopwords = func(stopwords, language)

    return stopwords


def add_filter(func, language=None):
    """
    Register a filter for a specific language.

    If language is None the filter applies to all languages and is called
    as ``func(stopwords, language)``; otherwise it is called as
    ``func(stopwords)``. In both cases it must return the filtered
    stop words.

    Filters are applied when a stop-word list is loaded, so they will not
    be applied to stop words that are already in the cache.

    :param func: callable
    :param language: string|None
    :return: None
    """
    _filters.setdefault(language, []).append(func)


def remove_filter(func, language=None):
    """
    Unregister a filter previously added with ``add_filter``.

    :param func: callable that was registered
    :param language: string|None, the language it was registered for
    :return: True if the filter was found and removed, False otherwise
    """
    try:
        _filters[language].remove(func)
    except (KeyError, ValueError):
        # KeyError: nothing registered for this language.
        # ValueError: this func is not registered for this language.
        return False
    return True


def safe_get_stop_words(language):
"""
Expand Down
2 changes: 1 addition & 1 deletion stop_words/stop-words
Submodule stop-words updated 0 files
21 changes: 17 additions & 4 deletions stop_words/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from stop_words import get_stop_words
from stop_words import safe_get_stop_words
from stop_words import StopWordError
from stop_words import STOP_WORDS_CACHE
from stop_words import LANGUAGE_MAPPING
from stop_words import AVAILABLE_LANGUAGES

Expand All @@ -28,9 +27,9 @@ def test_get_stop_words_language_mapping(self):
self.assertEqual(sw, get_stop_words('english'))

def test_get_stop_words_cache(self):
self.assertFalse('french' in STOP_WORDS_CACHE)
self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
sw = get_stop_words('fr')
self.assertTrue('french' in STOP_WORDS_CACHE)
self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
original_stop_words_dir = stop_words.STOP_WORDS_DIR
stop_words.STOP_WORDS_DIR = 'not-existing-directory'
self.assertEqual(sw, get_stop_words('french'))
Expand All @@ -39,7 +38,7 @@ def test_get_stop_words_cache(self):
get_stop_words('klingon')
except:
pass
self.assertFalse('klingon' in STOP_WORDS_CACHE)
self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)

def test_get_stop_words_unavailable_language(self):
self.assertRaises(StopWordError, get_stop_words, 'sindarin')
Expand All @@ -64,6 +63,20 @@ def test_random_language_stop_words_load(self):
'Cannot load stopwords for {0} language'.format(language)
)

def test_filters(self):
    """A globally registered filter is applied to freshly loaded stop words."""
    language = 'en'
    # Bypass the cache so the filter is applied when the list is loaded.
    before = get_stop_words(language, cache=False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]

    stop_words.add_filter(remove_letter)
    try:
        after = get_stop_words(language, cache=False)
        for stopword in after:
            self.assertFalse(letter in stopword)
    finally:
        # Always unregister: the filter registry is module-wide state and a
        # leftover filter would affect every test that runs afterwards.
        removed = stop_words.remove_filter(remove_letter)
    self.assertTrue(removed)


loader = TestLoader()

test_suite = TestSuite(
Expand Down

0 comments on commit 2a41dbb

Please sign in to comment.