not -> is None
guipenedo committed Jul 17, 2024
commit 7349902 (1 parent: 452e69a)
Showing 7 changed files with 11 additions and 11 deletions.
src/datatrove/pipeline/dedup/bloom_filter.py (1 addition, 1 deletion)

@@ -119,7 +119,7 @@ def parameters(self):
         where a and b are numpy uint64 arrays of shape (1, k) containing the
         random parameters for the hash functions.
         """
-        if not self._parameters:
+        if self._parameters is None:
             gen = np.random.RandomState(self.config.seed)
             self._parameters = (
                 gen.randint(1, _mersenne_prime, dtype=np.uint64, size=(1, self.config.k)),
src/datatrove/pipeline/dedup/minhash.py (1 addition, 1 deletion)

@@ -152,7 +152,7 @@ def parameters(self):
         Note: For 64-bit hashes the upper-bound for codomain is not [0,2**64) but [0,2**61 - 1)
         """
-        if not self._parameters:
+        if self._parameters is None:
             gen = np.random.RandomState(self.config.seed)
             self._parameters = (
                 gen.randint(1, _mersenne_prime, dtype=np.uint64, size=(1, self.num_hashes)),
src/datatrove/pipeline/filters/fasttext_filter.py (1 addition, 1 deletion)

@@ -65,7 +65,7 @@ def __init__(
 
     @property
     def model(self):
-        if not self._model:
+        if self._model is None:
             from fasttext.FastText import _FastText
 
             model_file = cached_asset_path_or_download(
src/datatrove/utils/dataset.py (1 addition, 1 deletion)

@@ -42,7 +42,7 @@ def __init__(
         self._f = None
 
     def __getitem__(self, item):
-        if not self._f:
+        if self._f is None:
             self._f = self.fs.open(self.file_path, "rb")
         chunk_size = self.token_size * (self.seq_len + 1)
         self._f.seek(item * chunk_size)
src/datatrove/utils/lid.py (1 addition, 1 deletion)

@@ -37,7 +37,7 @@ def __init__(self, languages: list[str] | None = None, k: int = -1) -> None:
 
     @property
     def model(self):
-        if not self._model:
+        if self._model is None:
             check_required_dependencies("lid", [("fasttext", "fasttext-wheel")])
             from fasttext.FastText import _FastText
 
src/datatrove/utils/tokenization.py (2 additions, 2 deletions)

@@ -34,7 +34,7 @@ def __init__(self):
 
     @property
     def token_size(self) -> int:
-        if not self._token_size:
+        if self._token_size is None:
             self._token_size = 4 if self.tokenizer.get_vocab_size() > np.iinfo(np.uint16).max + 1 else 2
         return self._token_size
 
@@ -44,7 +44,7 @@ def token_format(self) -> str:
 
     @property
     def tokenizer(self) -> "Tokenizer":
-        if not self._tokenizer:
+        if self._tokenizer is None:
             if not self.tokenizer_name_or_path:
                 raise ValueError("self.tokenizer_name_or_path needs to be set!")
             self._tokenizer = load_tokenizer(self.tokenizer_name_or_path)
src/datatrove/utils/word_tokenizers.py (4 additions, 4 deletions)

@@ -41,7 +41,7 @@ def __init__(self, punkt_language: str):
 
     @property
     def tokenizer(self):
-        if not self._tokenizer:
+        if self._tokenizer is None:
             from nltk import load
 
             self._tokenizer = load(f"tokenizers/punkt/{self.punkt_language}.pickle")
@@ -77,7 +77,7 @@ def __init__(self, spacy_language: str, config=None):
 
     @property
     def tokenizer(self):
-        if not self._tokenizer:
+        if self._tokenizer is None:
             import spacy
 
             if self.config is None:
@@ -114,7 +114,7 @@ def __init__(self, stanza_language: str, **stanza_kwargs):
 
     @property
     def tokenizer(self):
-        if not self._tokenizer:
+        if self._tokenizer is None:
             import stanza
             from stanza.pipeline.core import DownloadMethod
 
@@ -196,7 +196,7 @@ def __init__(self, model_type="sbg"):
 
     @property
     def tokenizer(self):
-        if not self._tokenizer:
+        if self._tokenizer is None:
             from kiwipiepy import Kiwi
 
             self._tokenizer = Kiwi(model_type=self.model_type)
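
All of these changes follow the same pattern: a lazily-initialized attribute whose sentinel is None is now tested with an explicit `is None` check instead of relying on truthiness. The sketch below (a hypothetical example, not code from datatrove) illustrates what the stricter check buys: with `not`, a cached value that happens to be falsy is treated as "not computed yet" and rebuilt on every access, while `is None` only matches the uninitialized sentinel.

class Lazy:
    """Hypothetical lazy-computation wrapper, not part of datatrove."""

    def __init__(self, compute):
        self._compute = compute
        self._cached = None  # sentinel meaning "not computed yet"

    @property
    def value(self):
        # With the old pattern, `if not self._cached:` would also fire when the
        # cached result is a valid but falsy value (0, "", [], {}), recomputing
        # it on every access; for a multi-element numpy array, `not` even raises
        # a ValueError. Matching only the None sentinel avoids both issues.
        if self._cached is None:
            self._cached = self._compute()
        return self._cached


lazy = Lazy(lambda: 0)   # 0 is a legitimate result, but falsy
assert lazy.value == 0   # computed on first access
assert lazy.value == 0   # second access hits the cache instead of recomputing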
