Skip to content

Commit

Permalink
fix logic, remove debug logs
Browse files Browse the repository at this point in the history
  • Loading branch information
jparismorgan committed Jul 16, 2024
1 parent fecd658 commit 21cf4e5
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 65 deletions.
6 changes: 2 additions & 4 deletions apis/python/src/tiledb/vector_search/ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def ingest(
raise ValueError("source_uri should not be provided alongside input_vectors")
if source_type and input_vectors:
raise ValueError("source_type should not be provided alongside input_vectors")

for variable in [
"training_input_vectors",
"training_source_uri",
Expand Down Expand Up @@ -1614,9 +1614,7 @@ def ingest_type_erased(
trace_id=trace_id,
)

if retrain_index and index_type == "IVF_PQ":
# For IVF_PQ, we cannot re-ingest the data, as we only store the PQ encoded
# vectors. Instead leave the centroids and just update the stored vectors.
if not retrain_index and index_type == "IVF_PQ":
print(
"[ingestion@ingest_type_erased] additions_vectors:",
additions_vectors,
Expand Down
41 changes: 21 additions & 20 deletions src/include/index/ivf_pq_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,18 +483,18 @@ class ivf_pq_index {
// through the training set. We need to move iteration over subspaces to
// the inner loop -- and SIMDize it
for (size_t subspace = 0; subspace < num_subspaces_; ++subspace) {
std::cout << " ============ " << std::endl;
// std::cout << " ============ " << std::endl;
auto sub_begin = subspace * dimensions_ / num_subspaces_;
auto sub_end = (subspace + 1) * dimensions_ / num_subspaces_;
std::cout << "[ivf_pq_index@train_pq] sub_begin: " << sub_begin
<< ", sub_end: " << sub_end << std::endl;
// std::cout << "[ivf_pq_index@train_pq] sub_begin: " << sub_begin
// << ", sub_end: " << sub_end << std::endl;

// auto local_sub_distance = SubDistance{sub_begin, sub_end};

// @todo Make choice of kmeans init configurable
sub_kmeans_random_init(
training_set, cluster_centroids_, sub_begin, sub_end, 0xdeadbeef);
debug_matrix(cluster_centroids_, "cluster_centroids_ before");
// debug_matrix(cluster_centroids_, "cluster_centroids_ before");

// sub_kmeans will invoke the sub_distance function with centroids
// against new_centroids, and will call flat::qv_partition with centroids
Expand All @@ -515,25 +515,26 @@ class ivf_pq_index {
tol_,
max_iter_,
num_threads_);
debug_matrix(cluster_centroids_, "cluster_centroids_ after");
// debug_matrix(cluster_centroids_, "cluster_centroids_ after");

max_local_iters_taken = std::max(max_local_iters_taken, iters);
min_local_conv = std::min(min_local_conv, conv);
}
std::cout << "New we create table! ~~~~~~~~~~~~~~~~~~~~~~~ " << std::endl;
// Create tables of distances storing distance between encoding keys,
// one table for each subspace. That is, distance_tables_[i](j, k) is
debug_matrix(cluster_centroids_, "cluster_centroids_ after");
// std::cout << "Now create distance table ~~~~~~~~~~~~~~~~~~~~~~~ "
//           << std::endl;
//
// Create tables of distances storing distance between encoding keys,
// one table for each subspace. That is, distance_tables_[i](j, k) is
// the distance between the jth and kth centroids in the ith subspace.
// The distance between two encoded vectors is looked up using the
// keys of the vectors in each subspace (summing up the results obtained
// from each subspace).
// @todo SIMDize with subspace iteration in inner loop
for (size_t subspace = 0; subspace < num_subspaces_; ++subspace) {
std::cout << " ~~~~~~~~~~~ " << std::endl;
// std::cout << " ~~~~~~~~~~~ " << std::endl;
auto sub_begin = subspace * sub_dimensions_;
auto sub_end = (subspace + 1) * sub_dimensions_;
std::cout << "[ivf_pq_index@train_pq] sub_begin: " << sub_begin
<< ", sub_end: " << sub_end << std::endl;
// std::cout << "[ivf_pq_index@train_pq] sub_begin: " << sub_begin
// << ", sub_end: " << sub_end << std::endl;
auto local_sub_distance = SubDistance{sub_begin, sub_end};

for (size_t i = 0; i < num_clusters_; ++i) {
Expand Down Expand Up @@ -881,16 +882,16 @@ class ivf_pq_index {
auto part_indices = partitioned_pq_vectors_->indices();
debug_vector(part_indices, "[ivf_pq_index@update] part_indices");
for (int i = 0; i < ::num_vectors(*partitioned_pq_vectors_); ++i) {
std::cout << "i: " << i
<< " (" + std::to_string((*partitioned_pq_vectors_).ids()[i]) +
")~~~"
<< std::endl;
// std::cout << "i: " << i
// << " (" + std::to_string((*partitioned_pq_vectors_).ids()[i]) +
// ")~~~"
// << std::endl;
if (std::find(
vector_ids_to_remove.begin(),
vector_ids_to_remove.end(),
(*partitioned_pq_vectors_).ids()[i]) ==
vector_ids_to_remove.end()) {
std::cout << "will copy over into idx: " << idx << std::endl;
// std::cout << "will copy over into idx: " << idx << std::endl;
// This vector is not marked for deletion, copy it over.
// unpartitioned_pq_vectors[idx] = (*partitioned_pq_vectors_)[i];
std::copy(
Expand All @@ -911,14 +912,14 @@ class ivf_pq_index {
// So right now we know that we're looking at vector `i`. Determine
// which partition it belongs to using part_indices.
auto partition = find_partition(part_indices, i);
std::cout << "partition: " << partition << std::endl;
// std::cout << "partition: " << partition << std::endl;
partition_labels.push_back(partition);

idx++;
}
debug_matrix_with_ids(
unpartitioned_pq_vectors,
" [ivf_pq_index@update] unpartitioned_pq_vectors");
// debug_matrix_with_ids(
// unpartitioned_pq_vectors,
// " [ivf_pq_index@update] unpartitioned_pq_vectors");
}
debug_matrix_with_ids(
unpartitioned_pq_vectors,
Expand Down
1 change: 0 additions & 1 deletion src/include/test/unit_api_feature_vector_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,6 @@ TEST_CASE("MatrixWithIds constructors and destructors", "[api]") {
(DataType*)b.data(), extents(b)[0], extents(b)[1]};
CHECK(data(0, 0) == 0);
CHECK(data(5, 0) == 5);
debug_matrix(data, "data");

CHECK(b.ids() != nullptr);
auto ids = std::span<IdsType>((IdsType*)b.ids(), b.num_vectors());
Expand Down
39 changes: 11 additions & 28 deletions src/include/test/unit_api_ivf_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1032,17 +1032,14 @@ TEST_CASE("update index", "[api_ivf_pq_index]") {

// Replace id 4 with id 44.
{
std::cout << "IndexIVFPQ() ========================" << std::endl;
auto index = IndexIVFPQ(ctx, index_uri);

std::cout << "index.update() ========================" << std::endl;
auto vectors_to_add = FeatureVectorArray(
ColMajorMatrixWithIds<feature_type_type, id_type_type>{
{{4, 4, 4, 4, 4, 4}}, {44}});
auto vector_ids_to_remove = FeatureVector(std::vector<id_type_type>{4});

auto index = IndexIVFPQ(ctx, index_uri);
index.update(vectors_to_add, vector_ids_to_remove);

std::cout << "index.query() ========================" << std::endl;
query_and_check_equals(
index,
FeatureVectorArray(ColMajorMatrix<feature_type_type>{
Expand Down Expand Up @@ -1074,18 +1071,16 @@ TEST_CASE("update index", "[api_ivf_pq_index]") {
// Replace id 44 with id 444, but also delete IDs which do not exist at the
// same time.
{
std::cout << "IndexIVFPQ() ========================" << std::endl;
auto index = IndexIVFPQ(ctx, index_uri);

std::cout << "index.update() ========================" << std::endl;
auto vectors_to_add = FeatureVectorArray(
ColMajorMatrixWithIds<feature_type_type, id_type_type>{
{{4, 4, 4, 4, 4, 4}}, {444}});
auto vector_ids_to_remove = FeatureVector(
std::vector<id_type_type>{4, 44, 99, 123, 456, 1000, 999});

auto index = IndexIVFPQ(ctx, index_uri);
index.update(vectors_to_add, vector_ids_to_remove);
index.write_index(ctx, index_uri);

std::cout << "index.query() ========================" << std::endl;
query_and_check_equals(
index,
FeatureVectorArray(ColMajorMatrix<feature_type_type>{
Expand All @@ -1097,23 +1092,19 @@ TEST_CASE("update index", "[api_ivf_pq_index]") {
ColMajorMatrix<uint32_t>{{1}, {2}, {3}, {444}},
ColMajorMatrix<float>{{0}, {0}, {0}, {0}},
n_list);

index.write_index(ctx, index_uri);
}

// Add a new vector
{
std::cout << "IndexIVFPQ() ========================" << std::endl;
auto index = IndexIVFPQ(ctx, index_uri);

std::cout << "index.update() ========================" << std::endl;
auto vectors_to_add = FeatureVectorArray(
ColMajorMatrixWithIds<feature_type_type, id_type_type>{
{{5, 5, 5, 5, 5, 5}}, {5}});
auto vector_ids_to_remove = FeatureVector(std::vector<id_type_type>{5});

auto index = IndexIVFPQ(ctx, index_uri);
index.update(vectors_to_add, vector_ids_to_remove);
index.write_index(ctx, index_uri);

std::cout << "index.query() ========================" << std::endl;
query_and_check_equals(
index,
FeatureVectorArray(ColMajorMatrix<feature_type_type>{
Expand All @@ -1126,26 +1117,18 @@ TEST_CASE("update index", "[api_ivf_pq_index]") {
ColMajorMatrix<uint32_t>{{1}, {2}, {3}, {444}, {444}},
ColMajorMatrix<float>{{0}, {0}, {0}, {0}, {6}},
n_list);

index.write_index(ctx, index_uri);
}

// Remove id 1.
std::cout << "Then test that we can remove data from the index. "
"==============================================================="
"==============="
<< std::endl;
{
std::cout << "IndexIVFPQ() ========================" << std::endl;
auto index = IndexIVFPQ(ctx, index_uri);

std::cout << "index.update() ========================" << std::endl;
auto vectors_to_add = FeatureVectorArray(
ColMajorMatrixWithIds<feature_type_type, id_type_type>{});
auto vector_ids_to_remove = FeatureVector(std::vector<id_type_type>{1});

auto index = IndexIVFPQ(ctx, index_uri);
index.update(vectors_to_add, vector_ids_to_remove);
index.write_index(ctx, index_uri);

std::cout << "index.query() ========================" << std::endl;
query_and_check_equals(
index,
FeatureVectorArray(ColMajorMatrix<feature_type_type>{
Expand Down
12 changes: 0 additions & 12 deletions src/include/test/utils/test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,18 +236,6 @@ void query_and_check_equals(
false, "[test_utils@query_and_check_equals] Scores did not match");
}
}

// CHECK(std::equal(
// scores.begin(),
// scores.end(),
// std::vector<float>{
// default_score, default_score, default_score, default_score}
// .begin()));
// CHECK(std::equal(
// ids.begin(),
// ids.end(),
// std::vector<uint32_t>{default_id, default_id, default_id, default_id}
// .begin()));
}

#endif // TILEDB_TEST_UTILS_H

0 comments on commit 21cf4e5

Please sign in to comment.