Simplify bench/ann scripts to Python-based module #1642

Merged: 33 commits, Jul 26, 2023
Changes from 13 commits

Commits (33):
d6b5f4e  add utility to download and move files (divyegala, Jul 11, 2023)
0c6c33b  run pre-commit (divyegala, Jul 11, 2023)
1be4eb1  add copyright (divyegala, Jul 11, 2023)
a50cd97  start working on runner script (divyegala, Jul 11, 2023)
e9b4eca  working build and search (divyegala, Jul 13, 2023)
87ad2b3  Merge remote-tracking branch 'upstream/branch-23.08' into bench-ann-s… (divyegala, Jul 13, 2023)
3a12d40  fix spelling (divyegala, Jul 13, 2023)
eb6d9a2  run flake8 manually (divyegala, Jul 13, 2023)
cd86fa3  add data_export.py script (divyegala, Jul 13, 2023)
a548b22  run flake8 manually (divyegala, Jul 13, 2023)
df16559  Update cpp/bench/ann/scripts/run.py (divyegala, Jul 14, 2023)
d4f30de  review suggestions (divyegala, Jul 14, 2023)
b665a64  add docs (divyegala, Jul 14, 2023)
079d8ef  spelling check (divyegala, Jul 14, 2023)
c62c423  address review (divyegala, Jul 14, 2023)
746c214  Merge remote-tracking branch 'upstream/branch-23.08' into bench-ann-s… (divyegala, Jul 14, 2023)
1430155  add faiss_gpu_ivf_sq (divyegala, Jul 18, 2023)
a38f21c  address review to use new string formatting, add plot.py (divyegala, Jul 19, 2023)
94ddec4  add end-to-end docs for b scale (divyegala, Jul 19, 2023)
cf44279  add plotting (divyegala, Jul 19, 2023)
7b4711e  correct executable=>algo strategy (divyegala, Jul 19, 2023)
7465b8d  address review (divyegala, Jul 20, 2023)
9b978c9  Merge branch 'branch-23.08' into bench-ann-scripts (cjnolet, Jul 20, 2023)
76d45fd  modify docs (divyegala, Jul 20, 2023)
494609e  fix some typos (divyegala, Jul 20, 2023)
d46d49d  run benchmarks with conda package (divyegala, Jul 20, 2023)
5adbf36  fix spelling (divyegala, Jul 21, 2023)
2f1e8ca  add build/search params to run.py (divyegala, Jul 21, 2023)
3ac8d76  add destructors to fix running raft benchmarks (divyegala, Jul 21, 2023)
ee61877  move algos.yaml (divyegala, Jul 21, 2023)
dbfae90  Merge branch 'branch-23.08' into bench-ann-scripts (cjnolet, Jul 24, 2023)
a0bf789  address review (divyegala, Jul 25, 2023)
7c1a6cf  add cmake example (divyegala, Jul 25, 2023)
27 changes: 27 additions & 0 deletions cpp/bench/ann/algos.yaml
@@ -0,0 +1,27 @@
faiss_gpu_ivf_flat:
executable: FAISS_IVF_FLAT_ANN_BENCH
disabled: false
faiss_gpu_flat:
executable: FAISS_IVF_FLAT_ANN_BENCH
disabled: false
faiss_gpu_ivf_pq:
executable: FAISS_IVF_PQ_ANN_BENCH
disabled: false
faiss_gpu_bfknn:
executable: FAISS_BFKNN_ANN_BENCH
disabled: false
raft_ivf_flat:
executable: RAFT_IVF_FLAT_ANN_BENCH
disabled: false
raft_ivf_pq:
executable: RAFT_IVF_PQ_ANN_BENCH
disabled: false
raft_cagra:
executable: RAFT_CAGRA_ANN_BENCH
disabled: false
ggnn:
executable: GGNN_ANN_BENCH
disabled: false
hnswlib:
executable: HNSWLIB_ANN_BENCH
disabled: false
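
For context, each entry above maps an algorithm name (the "algo" field used in the dataset JSON configs under conf/) to the benchmark executable expected under <root>/cpp/build, plus a disabled flag. The runner script added below consumes this file roughly as in the following condensed sketch:

import os

import yaml

# condensed view of the lookup run.py performs; "raft_cagra" is just one
# of the keys defined in algos.yaml above
with open("algos.yaml") as f:
    algos_conf = yaml.safe_load(f)

entry = algos_conf["raft_cagra"]
if not entry["disabled"]:
    # executables are assumed to live in <repo root>/cpp/build
    executable = os.path.join("..", "..", "build", entry["executable"])
    print(executable, os.path.exists(executable))
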
6 changes: 1 addition & 5 deletions cpp/bench/ann/conf/glove-100-inner.json
@@ -789,9 +789,5 @@

],
"search_result_file" : "result/glove-100-inner/ggnn/kbuild96-segment64-refine2-k10"
},


]

}]
}
58 changes: 58 additions & 0 deletions cpp/bench/ann/scripts/data_export.py
@@ -0,0 +1,58 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess


def export_results(output_filepath, recompute, groundtruth_filepath,
result_filepaths):
# result_filepaths = " ".join(result_filepaths)
# print(result_filepaths)
if recompute:
p = subprocess.Popen(["scripts/eval.pl", "-f", "-o", output_filepath,
groundtruth_filepath] + result_filepaths)
else:
p = subprocess.Popen(["scripts/eval.pl", "-o", output_filepath,
groundtruth_filepath] + result_filepaths)
p.wait()


def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--output", help="Path to the output file",
required=True)
parser.add_argument("--recompute", action="store_true",
help="Recompute metrics")
parser.add_argument("--groundtruth",
help="Dataset whose groundtruth is used",
required=True)
args, result_filepaths = parser.parse_known_args()

# assume "result/<groundtruth_dataset>" folder to be default
# if nothing is provided
if len(result_filepaths) == 0:
result_filepaths = ["result/%s" % args.groundtruth]

groundtruth_filepath = os.path.join("data", args.groundtruth,
"groundtruth.neighbors.ibin")
export_results(args.output, args.recompute, groundtruth_filepath,
result_filepaths)


if __name__ == "__main__":
main()
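
A minimal usage sketch for this script, assuming it is run from cpp/bench/ann with the default data/ and result/ layout (the output filename here is a placeholder):

import sys

# make scripts/data_export.py importable from cpp/bench/ann
sys.path.append("scripts")
from data_export import export_results

export_results(
    output_filepath="eval_glove-100-inner.txt",
    recompute=False,
    groundtruth_filepath="data/glove-100-inner/groundtruth.neighbors.ibin",
    result_filepaths=["result/glove-100-inner"],
)

The command-line equivalent passes any extra positional arguments as result paths, e.g. python scripts/data_export.py --output eval_glove-100-inner.txt --groundtruth glove-100-inner.
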
88 changes: 88 additions & 0 deletions cpp/bench/ann/scripts/get_dataset.py
@@ -0,0 +1,88 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import subprocess
from urllib.request import urlretrieve


def get_dataset_path(name):
if not os.path.exists("data"):
os.mkdir("data")
return os.path.join("data", "%s.hdf5" % name)


def download_dataset(url, path):
if not os.path.exists(path):
# TODO: should be atomic
print("downloading %s -> %s..." % (url, path))
urlretrieve(url, path)


def convert_hdf5_to_fbin(path, normalize):
if normalize and "angular" in path:
p = subprocess.Popen(["python", "scripts/hdf5_to_fbin.py", "-n",
"%s" % path])
else:
p = subprocess.Popen(["python", "scripts/hdf5_to_fbin.py",
"%s" % path])
p.wait()
Review comment (Member): Instead of invoking it via a subprocess, can we just call the Python function directly?

Reply (Member Author): Unfortunately, that script doesn't have a callable function. Would you prefer I refactor the script to make it work, or should we do it later?
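
A minimal sketch of such a refactor (hypothetical module layout, not part of this diff), assuming the ann-benchmarks HDF5 files expose "train"/"test" datasets and that .fbin files start with two int32 header values (rows, dim) followed by row-major data:

import struct

import h5py
import numpy as np


def write_bin(path, array):
    # .fbin layout assumed here: int32 rows, int32 dim, then row-major values
    with open(path, "wb") as f:
        f.write(struct.pack("<ii", array.shape[0], array.shape[1]))
        array.tofile(f)


def convert(hdf5_path, normalize=False):
    # a callable entry point so get_dataset.py could
    # `from hdf5_to_fbin import convert` instead of spawning a subprocess
    with h5py.File(hdf5_path, "r") as f:
        base = np.asarray(f["train"], dtype=np.float32)
        query = np.asarray(f["test"], dtype=np.float32)
    if normalize:
        base /= np.linalg.norm(base, axis=1, keepdims=True)
        query /= np.linalg.norm(query, axis=1, keepdims=True)
    prefix = hdf5_path.replace(".hdf5", "")
    write_bin(prefix + ".base.fbin", base)
    write_bin(prefix + ".query.fbin", query)
    # the groundtruth "neighbors"/"distances" datasets would be written
    # the same way with the matching .ibin/.fbin extensions
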



def move(name, path):
if "angular" in name:
new_name = name.replace("angular", "inner")
else:
new_name = name
new_path = os.path.join("data", new_name)
if not os.path.exists(new_path):
os.mkdir(new_path)
for bin_name in ["base.fbin", "query.fbin", "groundtruth.neighbors.ibin",
"groundtruth.distances.fbin"]:
os.rename("data/%s.%s" % (name, bin_name),
"%s/%s" % (new_path, bin_name))


def download(name, normalize):
path = get_dataset_path(name)
try:
url = "http://ann-benchmarks.com/%s.hdf5" % name
download_dataset(url, path)

convert_hdf5_to_fbin(path, normalize)

move(name, path)
except Exception:
print("Cannot download %s" % url)
raise


def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--name", help="dataset to download",
default="glove-100-angular")
parser.add_argument("--normalize",
help="normalize cosine distance to inner product",
action="store_true")

args = parser.parse_args()

download(args.name, args.normalize)


if __name__ == "__main__":
main()
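
A typical invocation, run from cpp/bench/ann with dataset names as published on ann-benchmarks.com:

python scripts/get_dataset.py --name glove-100-angular --normalize

move() renames angular datasets to the matching *-inner directory under data/, while --normalize controls whether the vectors are normalized during conversion. The TODO about making the download atomic could be addressed with a download-to-temporary-name-then-rename pattern, for example (a sketch, not part of this diff):

import os
from urllib.request import urlretrieve


def download_dataset(url, path):
    # fetch into a temporary file first, then atomically move it into place,
    # so an interrupted download never leaves a truncated .hdf5 behind
    if not os.path.exists(path):
        print("downloading %s -> %s..." % (url, path))
        tmp_path = path + ".part"
        urlretrieve(url, tmp_path)
        os.replace(tmp_path, path)
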
155 changes: 155 additions & 0 deletions cpp/bench/ann/scripts/run.py
@@ -0,0 +1,155 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
import subprocess
import yaml


def validate_algorithm_and_executable(algos_conf, algo):
algos_conf_keys = set(algos_conf.keys())
if algo in algos_conf_keys and not algos_conf[algo]["disabled"]:
# executable is assumed to be in folder "<root>/cpp/build"
executable_filepath = "../../build/%s" % algos_conf[algo]["executable"]
if not os.path.exists(executable_filepath):
raise FileNotFoundError(executable_filepath)
return True
else:
return False


def run_build_and_search(conf_filename, conf_file, executables_to_run, force):
# Need to write temporary configuration
temp_conf_filename = "temporary_%s" % conf_filename
temp_conf_filepath = os.path.join("conf", temp_conf_filename)
with open(temp_conf_filepath, "w") as f:
json.dump(conf_file, f)

print("Building indices for configuration %s" % conf_filename)
for executable in executables_to_run:
if force:
p = subprocess.Popen(["../../build/%s" % executable, "-b", "-f",
temp_conf_filepath])
p.wait()
else:
p = subprocess.Popen(["../../build/%s" % executable, "-b",
temp_conf_filepath])
p.wait()

print("Searching indices for configuration %s" % conf_filename)
for executable in executables_to_run:
if force:
p = subprocess.Popen(["../../build/%s" % executable, "-s", "-f",
temp_conf_filepath])
p.wait()
else:
p = subprocess.Popen(["../../build/%s" % executable, "-s",
temp_conf_filepath])
p.wait()

os.remove(temp_conf_filepath)


def main():
# Read list of allowed algorithms
with open("algos.yaml", "r") as f:
algos_conf = yaml.safe_load(f)

parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"--dataset",
help="the dataset to load training points from",
default="glove-100-inner",
)
parser.add_argument("--algorithms",
help="run only comma separated list of named \
algorithms",
default=None)
parser.add_argument("--indices",
help="run only comma separated list of named indices. \
parameter `algorithms` is ignored",
default=None)
parser.add_argument("--force",
help="re-run algorithms even if their results \
already exist",
action="store_true")

args = parser.parse_args()

# Read configuration file associated to dataset
conf_filename = "%s.json" % args.dataset
conf_filepath = os.path.join("conf", conf_filename)
if not os.path.exists(conf_filepath):
raise FileNotFoundError(conf_filename)

with open(conf_filepath, "r") as f:
conf_file = json.load(f)

# Ensure base and query files exist for dataset
if not os.path.exists(conf_file["dataset"]["base_file"]):
raise FileNotFoundError(conf_file["dataset"]["base_file"])
if not os.path.exists(conf_file["dataset"]["query_file"]):
raise FileNotFoundError(conf_file["dataset"]["query_file"])

temporary_conf = conf_file.copy()
found_pos = []
executables_to_run = set()
# At least one named index should exist in config file
if args.indices:
indices = set(args.indices.split(","))
# algo associated with index should still be present in algos.yaml
# and enabled
for pos, index in enumerate(conf_file["index"]):
curr_algo = index["algo"]
if index["name"] in indices and \
validate_algorithm_and_executable(algos_conf, curr_algo):
found_pos.append(pos)
executables_to_run.add(algos_conf[curr_algo]["executable"])

# switch to named algorithms if indices parameter is not supplied
elif args.algorithms:
algorithms = set(args.algorithms.split(","))
# pick out algorithms from conf file that exist
# and are enabled in algos.yaml
for pos, index in enumerate(conf_file["index"]):
curr_algo = index["algo"]
if curr_algo in algorithms and \
validate_algorithm_and_executable(algos_conf, curr_algo):
found_pos.append(pos)
executables_to_run.add(algos_conf[curr_algo]["executable"])

# default, try to run all available algorithms
else:
for pos, index in enumerate(conf_file["index"]):
curr_algo = index["algo"]
if validate_algorithm_and_executable(algos_conf, curr_algo):
found_pos.append(pos)
executables_to_run.add(algos_conf[curr_algo]["executable"])

# filter available algorithms or indices
if len(found_pos) == 0:
raise Exception("No named indices/algorithms found in %s"
% conf_filename)
temporary_conf["index"] = [temporary_conf["index"][p] for p in found_pos]

run_build_and_search(conf_filename, temporary_conf, executables_to_run,
args.force)


if __name__ == "__main__":
main()
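
run.py resolves algos.yaml, conf/<dataset>.json and the ../../build/<EXECUTABLE> paths relative to the working directory, so it appears intended to be launched from cpp/bench/ann after the dataset has been fetched and converted. A typical call, using names from the files above, would be:

python scripts/run.py --dataset glove-100-inner --algorithms raft_cagra,raft_ivf_pq

--indices selects individual index configurations by name and takes precedence over --algorithms, while --force re-runs builds and searches even when their results already exist.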