Skip to content

Commit

Permalink
Use np.random.Generator for rng (#694)
Browse files Browse the repository at this point in the history
* Use np.random.Generator for rng

Replace RandomState with random.Generator for generating random numbers.
RandomState is marked as a legacy API, and will have no further improvements
made to it. The Generator API also gives us some extra features, such as
the ability to specify dtypes up front rather than converting after the fact.

* spelling

* windows fix
  • Loading branch information
benfred authored Sep 29, 2023
1 parent f475283 commit c62b23f
Show file tree
Hide file tree
Showing 12 changed files with 35 additions and 31 deletions.
6 changes: 3 additions & 3 deletions implicit/cpu/als.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class AlternatingLeastSquares(MatrixFactorizationBase):
num_threads : int, optional
The number of threads to use for fitting the model and batch recommend calls.
Specifying 0 means to default to the number of cores on the machine.
random_state : int, numpy.random.RandomState or None, optional
random_state : int, numpy.random.RandomState, np.random.Generator or None, optional
The random state for seeding the initial item and user factors.
Default is None.
Expand Down Expand Up @@ -141,9 +141,9 @@ def fit(self, user_items, show_progress=True, callback=None):
s = time.time()
# Initialize the variables randomly if they haven't already been set
if self.user_factors is None:
self.user_factors = random_state.rand(users, self.factors).astype(self.dtype) * 0.01
self.user_factors = random_state.random((users, self.factors), dtype=self.dtype) * 0.01
if self.item_factors is None:
self.item_factors = random_state.rand(items, self.factors).astype(self.dtype) * 0.01
self.item_factors = random_state.random((items, self.factors), dtype=self.dtype) * 0.01

log.debug("Initialized factors in %s", time.time() - s)

Expand Down
8 changes: 4 additions & 4 deletions implicit/cpu/bpr.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase):
num_threads : int, optional
The number of threads to use for fitting the model and batch recommend calls.
Specifying 0 means to default to the number of cores on the machine.
random_state : int, RandomState or None, optional
random_state : int, RandomState, Generator or None, optional
The random state for seeding the initial item and user factors.
Default is None.
Expand Down Expand Up @@ -156,15 +156,15 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase):
# Note: the final dimension is for the item bias term - which is set to a 1 for all users
# this simplifies interfacing with approximate nearest neighbours libraries etc
if self.item_factors is None:
self.item_factors = (rs.rand(items, self.factors + 1).astype(self.dtype) - .5)
self.item_factors = (rs.random((items, self.factors + 1), dtype=self.dtype) - .5)
self.item_factors /= self.factors

# set factors to all zeros for items without any ratings
item_counts = np.bincount(user_items.indices, minlength=items)
self.item_factors[item_counts == 0] = np.zeros(self.factors + 1)

if self.user_factors is None:
self.user_factors = (rs.rand(users, self.factors + 1).astype(self.dtype) - .5)
self.user_factors = (rs.random((users, self.factors + 1), dtype=self.dtype) - .5)
self.user_factors /= self.factors

# set factors to all zeros for users without any ratings
Expand All @@ -183,7 +183,7 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase):
num_threads = multiprocessing.cpu_count()

# initialize RNGs, one per thread. Also pass the seeds for each thread's RNG
cdef long[:] rng_seeds = rs.randint(0, 2**31, size=num_threads)
cdef long[:] rng_seeds = rs.integers(0, 2**31, size=num_threads, dtype="long")
cdef RNGVector rng = RNGVector(num_threads, len(user_items.data) - 1, rng_seeds)

log.debug("Running %i BPR training epochs", self.iterations)
Expand Down
6 changes: 3 additions & 3 deletions implicit/cpu/lmf.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,14 @@ class LogisticMatrixFactorization(MatrixFactorizationBase):
# user_factors[-2] = user bias, item factors[-1] = item bias
# This significantly simplifies both training, and serving
if self.item_factors is None:
self.item_factors = rs.normal(size=(items, self.factors + 2)).astype(np.float32)
self.item_factors = rs.standard_normal(size=(items, self.factors + 2), dtype=np.float32)
self.item_factors[:, -1] = 1.0

# set factors to all zeros for items without any ratings
self.item_factors[item_counts == 0] = np.zeros(self.factors + 2)

if self.user_factors is None:
self.user_factors = rs.normal(size=(users, self.factors + 2)).astype(np.float32)
self.user_factors = rs.standard_normal(size=(users, self.factors + 2), dtype=np.float32)
self.user_factors[:, -2] = 1.0

# set factors to all zeros for users without any ratings
Expand All @@ -173,7 +173,7 @@ class LogisticMatrixFactorization(MatrixFactorizationBase):
num_threads = multiprocessing.cpu_count()

# initialize RNGs, one per thread. Also pass the seeds for each thread's RNG
cdef long[:] rng_seeds = rs.randint(0, 2**31, size=num_threads)
cdef long[:] rng_seeds = rs.integers(0, 2**31, size=num_threads, dtype="long")
cdef RNGVector rng = RNGVector(num_threads, len(user_items.data) - 1, rng_seeds)

log.debug("Running %i LMF training epochs", self.iterations)
Expand Down
2 changes: 1 addition & 1 deletion implicit/evaluation.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def train_test_split(ratings, train_percentage=0.8, random_state=None):

ratings = ratings.tocoo()
random_state = check_random_state(random_state)
random_index = random_state.random_sample(len(ratings.data))
random_index = random_state.random(len(ratings.data))
train_index = random_index < train_percentage
test_index = random_index >= train_percentage

Expand Down
8 changes: 4 additions & 4 deletions implicit/gpu/bpr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class BayesianPersonalizedRanking(MatrixFactorizationBase):
When sampling negative items, check if the randomly picked negative item has actually
been liked by the user. This check increases the time needed to train but usually leads
to better predictions.
random_state : int, RandomState or None, optional
random_state : int, RandomState, Generator or None, optional
The random state for seeding the initial item and user factors.
Default is None.
Expand Down Expand Up @@ -103,7 +103,7 @@ def fit(self, user_items, show_progress=True, callback=None):
# Note: the final dimension is for the item bias term - which is set to a 1 for all users
# this simplifies interfacing with approximate nearest neighbours libraries etc
if self.item_factors is None:
item_factors = rs.rand(items, self.factors + 1).astype("float32") - 0.5
item_factors = rs.random((items, self.factors + 1), "float32") - 0.5
item_factors /= self.factors

# set factors to all zeros for items without any ratings
Expand All @@ -112,7 +112,7 @@ def fit(self, user_items, show_progress=True, callback=None):
self.item_factors = implicit.gpu.Matrix(item_factors)

if self.user_factors is None:
user_factors = rs.rand(users, self.factors + 1).astype("float32") - 0.5
user_factors = rs.random((users, self.factors + 1), "float32") - 0.5
user_factors /= self.factors

# set factors to all zeros for users without any ratings
Expand Down Expand Up @@ -142,7 +142,7 @@ def fit(self, user_items, show_progress=True, callback=None):
Y,
self.learning_rate,
self.regularization,
rs.randint(2**31),
rs.integers(2**31),
self.verify_negative_samples,
)
progress.update(1)
Expand Down
6 changes: 5 additions & 1 deletion implicit/gpu/matrix_factorization_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,14 +239,18 @@ def check_random_state(random_state):
Parameters
----------
random_state : int, None or RandomState
random_state : int, None, np.random.RandomState or np.random.Generator
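The existing numpy rng, or a seed. A numpy RandomState or Generator is used to
draw the seed for a new curand RandomState generator; an int seeds it directly,
and None (or any other falsy value, including 0) falls back to a time-based seed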
"""
if isinstance(random_state, np.random.RandomState):
# we need to convert from the numpy RandomState to our internal random state
return implicit.gpu.RandomState(random_state.randint(2**31))

if isinstance(random_state, np.random.Generator):
# we need to convert from the numpy Generator to our internal random state
return implicit.gpu.RandomState(random_state.integers(2**31))

# otherwise try to initialize a new one, and let it fail through
# on the numpy side if it doesn't work
return implicit.gpu.RandomState(random_state or int(time.time()))
13 changes: 7 additions & 6 deletions implicit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,22 @@ def check_blas_config():
def check_random_state(random_state):
    """Validate the random state.

    Check a random seed or existing numpy rng
    and get back an initialized numpy.random.Generator

    Parameters
    ----------
    random_state : int, None, np.random.RandomState or np.random.Generator
        The existing rng, or a seed. A legacy RandomState is used to draw
        the seed for a new numpy Generator. None, or an int, will be used
        to seed a new numpy Generator directly. An existing Generator is
        handed to ``np.random.default_rng``, which returns it unaltered.

    Returns
    -------
    numpy.random.Generator
        The generator to use for all subsequent random draws.
    """
    # backwards compatibility: callers may still hand us a legacy RandomState,
    # so draw a seed from it (keeping results reproducible for a seeded
    # RandomState) and build a Generator from that seed.
    # Note: the RandomState method is `randint`; `rand_int` does not exist.
    if isinstance(random_state, np.random.RandomState):
        return np.random.default_rng(random_state.randint(2**31))

    # otherwise try to initialize a new one, and let it fail through
    # on the numpy side if it doesn't work
    return np.random.default_rng(random_state)


def augment_inner_product_matrix(factors):
Expand Down
2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,5 @@ CUDACXX = "/usr/local/cuda/bin/nvcc"
[tool.cibuildwheel.macos]
archs = ["x86_64", "universal2", "arm64"]



[tool.pytest.ini_options]
filterwarnings = ['ignore::implicit.utils.ParameterWarning']
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,6 @@ def exclude_non_implicit_cmake_files(cmake_manifest):
"Collaborative Filtering, Recommender Systems"
),
packages=find_packages(),
install_requires=["numpy", "scipy>=0.16", "tqdm>=4.27", "threadpoolctl"],
install_requires=["numpy>=1.17.0", "scipy>=0.16", "tqdm>=4.27", "threadpoolctl"],
cmake_process_manifest_hook=exclude_non_implicit_cmake_files,
)
2 changes: 1 addition & 1 deletion tests/als_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def test_factorize(use_native, use_gpu, use_cg, dtype):
reconstructed = rows.dot(cols.T)
for i in range(counts.shape[0]):
for j in range(counts.shape[1]):
assert pytest.approx(counts[i, j], abs=1e-4) == reconstructed[i, j], (
assert pytest.approx(counts[i, j], abs=1e-3) == reconstructed[i, j], (
"failed to reconstruct row=%s, col=%s,"
" value=%.5f, dtype=%s, cg=%s, native=%s gpu=%s"
% (i, j, reconstructed[i, j], dtype, use_cg, use_native, use_gpu)
Expand Down
2 changes: 1 addition & 1 deletion tests/bpr_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def _get_model(self):
factors=3,
regularization=0,
use_gpu=True,
learning_rate=0.05,
learning_rate=0.1,
random_state=42,
)

Expand Down
9 changes: 5 additions & 4 deletions tests/gpu_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ def test_topk_ascending(k, batch, temp_memory):
def test_topk_random(k, batch, temp_memory):
num_items = 1000
factors = 10
np.random.seed(0)
items = np.random.uniform(size=(num_items, factors)).astype("float32")
queries = np.random.uniform(size=(batch, factors)).astype("float32")

rs = np.random.default_rng(0)
items = rs.random(size=(num_items, factors), dtype="float32")
queries = rs.random(size=(batch, factors), dtype="float32")
_check_knn_queries(items, queries, k, max_temp_memory=temp_memory)


Expand All @@ -46,8 +47,8 @@ def _check_knn_queries(items, queries, k=5, max_temp_memory=500_000_000):
exact_distances[r] = batch[r][exact_ids[r]]

# make sure that we match
assert_array_equal(ids, exact_ids)
assert_allclose(distances, exact_distances, rtol=1e-06)
assert_array_equal(ids, exact_ids)


@pytest.mark.skipif(not implicit.gpu.HAS_CUDA, reason="needs cuda build")
Expand Down

0 comments on commit c62b23f

Please sign in to comment.