Skip to content

Commit

Permalink
Add setters for the delimiters
Browse files Browse the repository at this point in the history
Adding four new methods to allow the user to set the delimiters:
	* set_block_delimiters
	* set_delimiter_paragraph
	* set_delimiter_sentence
	* set_delimiter_word
  • Loading branch information
sergioburdisso committed Feb 9, 2020
1 parent 253c778 commit b632fe0
Showing 1 changed file with 67 additions and 6 deletions.
73 changes: 67 additions & 6 deletions pyss3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,18 +867,79 @@ def set_model_path(self, path):
"""
self.__models_folder__ = os.path.join(path, STR_MODEL_FOLDER)

def set_block_delimeters(self, parag=None, sent=None, word=None):
"""Overwrite the default delimiters used to split input documents into blocks.
def set_block_delimiters(self, parag=None, sent=None, word=None):
    r"""Overwrite the default delimiters used to split input documents into blocks.

    Delimiters are regular expressions, from simple ones (e.g. ``" "``) to
    more complex ones (e.g. ``r"[^\s\w\d]"``).

    Note: remember that certain characters are reserved in regular
    expressions, for example the dot (.); use a backslash to refer to the
    character itself rather than its special meaning (e.g. ``\.``).

    Each delimiter that is given is forwarded to its dedicated setter
    (``set_delimiter_paragraph``, ``set_delimiter_sentence`` and
    ``set_delimiter_word``). Arguments left as ``None`` (or any other falsy
    value, such as an empty string) leave the corresponding delimiter
    unchanged.

    e.g.

    >>> ss3.set_block_delimiters(word=r"\s")
    >>> ss3.set_block_delimiters(word=r"\s", parag="\n\n")
    >>> ss3.set_block_delimiters(parag="\n---\n")
    >>> ss3.set_block_delimiters(sent=r"\.")
    >>> ss3.set_block_delimiters(word=r"\|")
    >>> ss3.set_block_delimiters(word=" ")

    :param parag: the paragraph new delimiter
    :type parag: str
    :param sent: the sentence new delimiter
    :type sent: str
    :param word: the word new delimiter
    :type word: str
    """
    # Always go through the setters instead of assigning the attributes
    # directly, so any normalisation a setter applies is never bypassed.
    if parag:
        self.set_delimiter_paragraph(parag)
    if sent:
        self.set_delimiter_sentence(sent)
    if word:
        self.set_delimiter_word(word)

def set_delimiter_paragraph(self, regex):
    r"""
    Set the delimiter used to split documents into paragraphs.

    The value is stored as given, without any validation or rewriting.

    Keep in mind that some characters are reserved in regular expressions
    (the dot, for instance); prefix them with a backslash when the literal
    character is meant (e.g. \.)

    :param regex: the regular expression of the new delimiter
    :type regex: str
    """
    self.__parag_delimiter__ = regex

def set_delimiter_sentence(self, regex):
    r"""
    Set the delimiter used to split documents into sentences.

    Unless the given pattern already begins with a parenthesised part, it is
    wrapped in a capturing group before being stored, so that ``re.split``
    also returns the text matched by the delimiter.

    Keep in mind that some characters are reserved in regular expressions
    (the dot, for instance); prefix them with a backslash when the literal
    character is meant (e.g. \.)

    :param regex: the regular expression of the new delimiter
    :type regex: str
    """
    # NOTE(review): this check only looks for a leading "(...)"; a pattern
    # starting with a non-capturing group such as "(?:...)" is left unwrapped
    # -- confirm whether that case needs handling.
    already_grouped = re.match(r"\(.*\)", regex) is not None
    self.__sent_delimiter__ = regex if already_grouped else "(%s)" % regex

def set_delimiter_word(self, regex):
    r"""
    Set the delimiter used to split documents into words.

    The value is stored as given, without any validation or rewriting.

    Keep in mind that some characters are reserved in regular expressions
    (the dot, for instance); prefix them with a backslash when the literal
    character is meant (e.g. \.)

    :param regex: the regular expression of the new delimiter
    :type regex: str
    """
    self.__word_delimiter__ = regex

def set_s(self, value):
"""
Expand Down

0 comments on commit b632fe0

Please sign in to comment.