Simplify bench/ann scripts to Python-based module #1642

Merged: 33 commits, Jul 26, 2023
Changes from 13 commits

Commits (33):
d6b5f4e  add utility to download and move files (divyegala, Jul 11, 2023)
0c6c33b  run pre-commit (divyegala, Jul 11, 2023)
1be4eb1  add copyright (divyegala, Jul 11, 2023)
a50cd97  start working on runner script (divyegala, Jul 11, 2023)
e9b4eca  working build and search (divyegala, Jul 13, 2023)
87ad2b3  Merge remote-tracking branch 'upstream/branch-23.08' into bench-ann-s… (divyegala, Jul 13, 2023)
3a12d40  fix spelling (divyegala, Jul 13, 2023)
eb6d9a2  run flake8 manually (divyegala, Jul 13, 2023)
cd86fa3  add data_export.py script (divyegala, Jul 13, 2023)
a548b22  run flake8 manually (divyegala, Jul 13, 2023)
df16559  Update cpp/bench/ann/scripts/run.py (divyegala, Jul 14, 2023)
d4f30de  review suggestions (divyegala, Jul 14, 2023)
b665a64  add docs (divyegala, Jul 14, 2023)
079d8ef  spelling check (divyegala, Jul 14, 2023)
c62c423  address review (divyegala, Jul 14, 2023)
746c214  Merge remote-tracking branch 'upstream/branch-23.08' into bench-ann-s… (divyegala, Jul 14, 2023)
1430155  add faiss_gpu_ivf_sq (divyegala, Jul 18, 2023)
a38f21c  address review to use new string formatting, add plot.py (divyegala, Jul 19, 2023)
94ddec4  add end-to-end docs for b scale (divyegala, Jul 19, 2023)
cf44279  add plotting (divyegala, Jul 19, 2023)
7b4711e  correct executable=>algo strategy (divyegala, Jul 19, 2023)
7465b8d  address review (divyegala, Jul 20, 2023)
9b978c9  Merge branch 'branch-23.08' into bench-ann-scripts (cjnolet, Jul 20, 2023)
76d45fd  modify docs (divyegala, Jul 20, 2023)
494609e  fix some typos (divyegala, Jul 20, 2023)
d46d49d  run benchmarks with conda package (divyegala, Jul 20, 2023)
5adbf36  fix spelling (divyegala, Jul 21, 2023)
2f1e8ca  add build/search params to run.py (divyegala, Jul 21, 2023)
3ac8d76  add destructors to fix running raft benchmarks (divyegala, Jul 21, 2023)
ee61877  move algos.yaml (divyegala, Jul 21, 2023)
dbfae90  Merge branch 'branch-23.08' into bench-ann-scripts (cjnolet, Jul 24, 2023)
a0bf789  address review (divyegala, Jul 25, 2023)
7c1a6cf  add cmake example (divyegala, Jul 25, 2023)
27 changes: 27 additions & 0 deletions cpp/bench/ann/algos.yaml
@@ -0,0 +1,27 @@
faiss_gpu_ivf_flat:
executable: FAISS_IVF_FLAT_ANN_BENCH
disabled: false
faiss_gpu_flat:
executable: FAISS_IVF_FLAT_ANN_BENCH
disabled: false
faiss_gpu_ivf_pq:
executable: FAISS_IVF_PQ_ANN_BENCH
disabled: false
faiss_gpu_bfknn:
executable: FAISS_BFKNN_ANN_BENCH
disabled: false
raft_ivf_flat:
executable: RAFT_IVF_FLAT_ANN_BENCH
disabled: false
raft_ivf_pq:
executable: RAFT_IVF_PQ_ANN_BENCH
disabled: false
raft_cagra:
executable: RAFT_CAGRA_ANN_BENCH
disabled: false
ggnn:
executable: GGNN_ANN_BENCH
disabled: false
hnswlib:
executable: HNSWLIB_ANN_BENCH
disabled: false
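
For context, each entry above maps an algorithm name (the "algo" field used in the dataset JSON configs under conf/) to the benchmark executable expected under <root>/cpp/build, plus a disabled flag. The runner script added below consumes this file roughly as in the following condensed sketch:

import os

import yaml

# condensed view of the lookup run.py performs; "raft_cagra" is just one
# of the keys defined in algos.yaml above
with open("algos.yaml") as f:
    algos_conf = yaml.safe_load(f)

entry = algos_conf["raft_cagra"]
if not entry["disabled"]:
    # executables are assumed to live in <repo root>/cpp/build
    executable = os.path.join("..", "..", "build", entry["executable"])
    print(executable, os.path.exists(executable))
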
6 changes: 1 addition & 5 deletions cpp/bench/ann/conf/glove-100-inner.json
@@ -789,9 +789,5 @@

],
"search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10"
},


]

}]
}
58 changes: 58 additions & 0 deletions cpp/bench/ann/scripts/data_export.py
@@ -0,0 +1,58 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess


def export_results(output_filepath, recompute, groundtruth_filepath,
result_filepaths):
# result_filepaths = " ".join(result_filepaths)
# print(result_filepaths)
if recompute:
p = subprocess.Popen(["scripts/eval.pl", "-f", "-o", output_filepath,
groundtruth_filepath] + result_filepaths)
else:
p = subprocess.Popen(["scripts/eval.pl", "-o", output_filepath,
groundtruth_filepath] + result_filepaths)
p.wait()


def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--output", help="Path to the output file",
required=True)
parser.add_argument("--recompute", action="store_true",
help="Recompute metrics")
parser.add_argument("--groundtruth",
help="Dataset whose groundtruth is used",
required=True)
args, result_filepaths = parser.parse_known_args()

# assume "result/<groundtruth_dataset>" folder to be default
# if nothing is provided
if len(result_filepaths) == 0:
result_filepaths = ["result/%s" % args.groundtruth]

groundtruth_filepath = os.path.join("data", args.groundtruth,
"groundtruth.neighbors.ibin")
export_results(args.output, args.recompute, groundtruth_filepath,
result_filepaths)


if __name__ == "__main__":
main()
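
A minimal usage sketch for this script, assuming it is run from cpp/bench/ann with the default data/ and result/ layout (the output filename here is a placeholder):

import sys

# make scripts/data_export.py importable from cpp/bench/ann
sys.path.append("scripts")
from data_export import export_results

export_results(
    output_filepath="eval_glove-100-inner.txt",
    recompute=False,
    groundtruth_filepath="data/glove-100-inner/groundtruth.neighbors.ibin",
    result_filepaths=["result/glove-100-inner"],
)

The command-line equivalent passes any extra positional arguments as result paths, e.g. python scripts/data_export.py --output eval_glove-100-inner.txt --groundtruth glove-100-inner.
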
88 changes: 88 additions & 0 deletions cpp/bench/ann/scripts/get_dataset.py
@@ -0,0 +1,88 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess
from urllib.request import urlretrieve


def get_dataset_path(name):
if not os.path.exists("data"):
os.mkdir("data")
return os.path.join("data", "%s.hdf5" % name)


def download_dataset(url, path):
if not os.path.exists(path):
# TODO: should be atomic
print("downloading %s -> %s..." % (url, path))
urlretrieve(url, path)


def convert_hdf5_to_fbin(path, normalize):
if normalize and "angular" in path:
p = subprocess.Popen(["python", "scripts/hdf5_to_fbin.py", "-n",
"%s" % path])
else:
p = subprocess.Popen(["python", "scripts/hdf5_to_fbin.py",
"%s" % path])
p.wait()
Review comment (Member): Instead of invoking it via a subprocess, can we just call the Python function directly?

Reply (Member Author): Unfortunately, that script doesn't have a callable function. Would you prefer I refactor the script to make it work, or should we do it later?
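
A minimal sketch of such a refactor (hypothetical module layout, not part of this diff), assuming the ann-benchmarks HDF5 files expose "train"/"test" datasets and that .fbin files start with two int32 header values (rows, dim) followed by row-major data:

import struct

import h5py
import numpy as np


def write_bin(path, array):
    # .fbin layout assumed here: int32 rows, int32 dim, then row-major values
    with open(path, "wb") as f:
        f.write(struct.pack("<ii", array.shape[0], array.shape[1]))
        array.tofile(f)


def convert(hdf5_path, normalize=False):
    # a callable entry point so get_dataset.py could
    # `from hdf5_to_fbin import convert` instead of spawning a subprocess
    with h5py.File(hdf5_path, "r") as f:
        base = np.asarray(f["train"], dtype=np.float32)
        query = np.asarray(f["test"], dtype=np.float32)
    if normalize:
        base /= np.linalg.norm(base, axis=1, keepdims=True)
        query /= np.linalg.norm(query, axis=1, keepdims=True)
    prefix = hdf5_path.replace(".hdf5", "")
    write_bin(prefix + ".base.fbin", base)
    write_bin(prefix + ".query.fbin", query)
    # the groundtruth "neighbors"/"distances" datasets would be written
    # the same way with the matching .ibin/.fbin extensions
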



def move(name, path):
if "angular" in name:
new_name = name.replace("angular", "inner")
else:
new_name = name
new_path = os.path.join("data", new_name)
if not os.path.exists(new_path):
os.mkdir(new_path)
for bin_name in ["base.fbin", "query.fbin", "groundtruth.neighbors.ibin",
"groundtruth.distances.fbin"]:
os.rename("data/%s.%s" % (name, bin_name),
"%s/%s" % (new_path, bin_name))


def download(name, normalize):
path = get_dataset_path(name)
try:
url = "http://ann-benchmarks.com/%s.hdf5" % name
download_dataset(url, path)

convert_hdf5_to_fbin(path, normalize)

move(name, path)
except Exception:
print("Cannot download %s" % url)
raise


def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--name", help="dataset to download",
default="glove-100-angular")
parser.add_argument("--normalize",
help="normalize cosine distance to inner product",
action="store_true")

args = parser.parse_args()

download(args.name, args.normalize)


if __name__ == "__main__":
main()
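
A typical invocation, run from cpp/bench/ann with dataset names as published on ann-benchmarks.com:

python scripts/get_dataset.py --name glove-100-angular --normalize

move() renames angular datasets to the matching *-inner directory under data/, while --normalize controls whether the vectors are normalized during conversion. The TODO about making the download atomic could be addressed with a download-to-temporary-name-then-rename pattern, for example (a sketch, not part of this diff):

import os
from urllib.request import urlretrieve


def download_dataset(url, path):
    # fetch into a temporary file first, then atomically move it into place,
    # so an interrupted download never leaves a truncated .hdf5 behind
    if not os.path.exists(path):
        print("downloading %s -> %s..." % (url, path))
        tmp_path = path + ".part"
        urlretrieve(url, tmp_path)
        os.replace(tmp_path, path)
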
155 changes: 155 additions & 0 deletions cpp/bench/ann/scripts/run.py
@@ -0,0 +1,155 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import subprocess
import yaml


def validate_algorithm_and_executable(algos_conf, algo):
algos_conf_keys = set(algos_conf.keys())
if algo in algos_conf_keys and not algos_conf[algo]["disabled"]:
# executable is assumed to be in folder "<root>/cpp/build"
executable_filepath = "../../build/%s" % algos_conf[algo]["executable"]
if not os.path.exists(executable_filepath):
raise FileNotFoundError(executable_filepath)
return True
else:
return False


def run_build_and_search(conf_filename, conf_file, executables_to_run, force):
# Need to write temporary configuration
temp_conf_filename = "temporary_%s" % conf_filename
temp_conf_filepath = os.path.join("conf", temp_conf_filename)
with open(temp_conf_filepath, "w") as f:
json.dump(conf_file, f)

print("Building indices for configuration %s" % conf_filename)
for executable in executables_to_run:
if force:
p = subprocess.Popen(["../../build/%s" % executable, "-b", "-f",
temp_conf_filepath])
p.wait()
else:
p = subprocess.Popen(["../../build/%s" % executable, "-b",
temp_conf_filepath])
p.wait()

print("Searching indices for configuration %s" % conf_filename)
for executable in executables_to_run:
if force:
p = subprocess.Popen(["../../build/%s" % executable, "-s", "-f",
temp_conf_filepath])
p.wait()
else:
p = subprocess.Popen(["../../build/%s" % executable, "-s",
temp_conf_filepath])
p.wait()

os.remove(temp_conf_filepath)


def main():
# Read list of allowed algorithms
with open("algos.yaml", "r") as f:
algos_conf = yaml.safe_load(f)

parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"--dataset",
help="the dataset to load training points from",
default="glove-100-inner",
)
parser.add_argument("--algorithms",
help="run only comma separated list of named \
algorithms",
default=None)
parser.add_argument("--indices",
help="run only comma separated list of named indices. \
parameter `algorithms` is ignored",
default=None)
parser.add_argument("--force",
help="re-run algorithms even if their results \
already exist",
action="store_true")

args = parser.parse_args()

# Read configuration file associated to dataset
conf_filename = "%s.json" % args.dataset
conf_filepath = os.path.join("conf", conf_filename)
if not os.path.exists(conf_filepath):
raise FileNotFoundError(conf_filename)

with open(conf_filepath, "r") as f:
conf_file = json.load(f)

# Ensure base and query files exist for dataset
if not os.path.exists(conf_file["dataset"]["base_file"]):
raise FileNotFoundError(conf_file["dataset"]["base_file"])
if not os.path.exists(conf_file["dataset"]["query_file"]):
raise FileNotFoundError(conf_file["dataset"]["query_file"])

temporary_conf = conf_file.copy()
found_pos = []
executables_to_run = set()
# At least one named index should exist in config file
if args.indices:
indices = set(args.indices.split(","))
# algo associated with index should still be present in algos.yaml
# and enabled
for pos, index in enumerate(conf_file["index"]):
curr_algo = index["algo"]
if index["name"] in indices and \
validate_algorithm_and_executable(algos_conf, curr_algo):
found_pos.append(pos)
executables_to_run.add(algos_conf[curr_algo]["executable"])

# switch to named algorithms if indices parameter is not supplied
elif args.algorithms:
algorithms = set(args.algorithms.split(","))
# pick out algorithms from conf file that exist
# and are enabled in algos.yaml
for pos, index in enumerate(conf_file["index"]):
curr_algo = index["algo"]
if curr_algo in algorithms and \
validate_algorithm_and_executable(algos_conf, curr_algo):
found_pos.append(pos)
executables_to_run.add(algos_conf[curr_algo]["executable"])

# default, try to run all available algorithms
else:
for pos, index in enumerate(conf_file["index"]):
curr_algo = index["algo"]
if validate_algorithm_and_executable(algos_conf, curr_algo):
found_pos.append(pos)
executables_to_run.add(algos_conf[curr_algo]["executable"])

# filter available algorithms or indices
if len(found_pos) == 0:
raise Exception("No named indices/algorithms found in %s"
% conf_filename)
temporary_conf["index"] = [temporary_conf["index"][p] for p in found_pos]

run_build_and_search(conf_filename, temporary_conf, executables_to_run,
args.force)


if __name__ == "__main__":
main()
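
run.py resolves algos.yaml, conf/<dataset>.json and the ../../build/<EXECUTABLE> paths relative to the working directory, so it appears intended to be launched from cpp/bench/ann after the dataset has been fetched and converted. A typical call, using names from the files above, would be:

python scripts/run.py --dataset glove-100-inner --algorithms raft_cagra,raft_ivf_pq

--indices selects individual index configurations by name and takes precedence over --algorithms, while --force re-runs builds and searches even when their results already exist.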