From 8549b546ef1a97b4c25a0f25b73700802d563d17 Mon Sep 17 00:00:00 2001
From: Naim <110031745+naimnv@users.noreply.github.com>
Date: Mon, 20 Nov 2023 21:39:53 +0100
Subject: [PATCH] Fix Leiden refinement phase (#3990)

- The normalization factor was missing from the equation that decides whether a node and a refined community are strongly connected inside their Louvain community. This PR adds that factor.
- Disables random moves in the refinement phase. We plan to expose a flag to enable/disable random moves in a future PR.
- Adds a new function to flatten the Leiden dendrogram, because the flattening process needs additional information to unroll the hierarchical Leiden clustering.

Closes #3850
Closes #3749

Authors:
  - Naim (https://github.com/naimnv)
  - Alex Barghi (https://github.com/alexbarghi-nv)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Brad Rees (https://github.com/BradReesWork)

URL: https://github.com/rapidsai/cugraph/pull/3990
---
 cpp/src/community/detail/common_methods.cuh   |  46 +++-
 cpp/src/community/detail/refine_impl.cuh      |  22 +-
 cpp/src/community/flatten_dendrogram.hpp      |  29 ++-
 cpp/src/community/leiden_impl.cuh             | 200 ++++++++++--------
 cpp/tests/c_api/leiden_test.c                 |   4 +-
 cpp/tests/c_api/louvain_test.c                |  39 +++-
 cpp/tests/community/louvain_test.cpp          |  81 +------
 .../cugraph/tests/community/test_leiden.py    |  28 +--
 8 files changed, 242 insertions(+), 207 deletions(-)
diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh
index b388ba53e81..f67d4d939ad 100644
--- a/cpp/src/community/detail/common_methods.cuh
+++ b/cpp/src/community/detail/common_methods.cuh
@@ -52,7 +52,7 @@ struct is_bitwise_comparable<cuco::pair<int32_t, float>> : std::true_type {};
 namespace cugraph {
 namespace detail {
 
-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
+// FIXME: a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename weight_t>
 struct key_aggregated_edge_op_t {
   weight_t total_edge_weight{};
@@ -80,7 +80,7 @@ struct key_aggregated_edge_op_t {
   }
 };
 
-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
+// FIXME: a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename weight_t>
 struct reduce_op_t {
   using type                          = thrust::tuple<vertex_t, weight_t>;
@@ -100,7 +100,28 @@ struct reduce_op_t {
   }
 };
 
-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
+// FIXME: a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
+template <typename vertex_t, typename weight_t>
+struct count_updown_moves_op_t {
+  // Direction filter: a move counts only when (new_cluster > old_cluster) equals up_down.
+  bool up_down{};
+
+  // Input element is (old cluster, (candidate cluster, modularity gain)). Returns true iff the
+  // gain is strictly positive and the move goes in the direction selected by up_down.
+  __device__ bool operator()(thrust::tuple<vertex_t, thrust::tuple<vertex_t, weight_t>> p) const
+  {
+    vertex_t old_cluster       = thrust::get<0>(p);
+    auto new_cluster_gain_pair = thrust::get<1>(p);
+    vertex_t new_cluster       = thrust::get<0>(new_cluster_gain_pair);
+    weight_t delta_modularity  = thrust::get<1>(new_cluster_gain_pair);
+
+    // Zero or negative gain never counts as a move, regardless of direction.
+    bool const is_gain_positive   = delta_modularity > weight_t{0};
+    bool const is_direction_match = (new_cluster > old_cluster) == up_down;
+    return is_gain_positive && is_direction_match;
+  }
+};
+// FIXME: a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename weight_t>
 struct cluster_update_op_t {
   bool up_down{};
@@ -115,7 +136,7 @@ struct cluster_update_op_t {
   }
 };
 
-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
+// FIXME: a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename weight_t>
 struct return_edge_weight_t {
   __device__ auto operator()(
@@ -125,7 +146,7 @@ struct return_edge_weight_t {
   }
 };
 
-// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
+// FIXME: a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename vertex_t, typename weight_t>
 struct return_one_t {
   __device__ auto operator()(
@@ -394,6 +415,21 @@ rmm::device_uvector<vertex_t> update_clustering_by_delta_modularity(
     detail::reduce_op_t<vertex_t, weight_t>{},
     cugraph::get_dataframe_buffer_begin(output_buffer));
 
+  int nr_moves = thrust::count_if(
+    handle.get_thrust_policy(),
+    thrust::make_zip_iterator(thrust::make_tuple(
+      next_clusters_v.begin(), cugraph::get_dataframe_buffer_begin(output_buffer))),
+    thrust::make_zip_iterator(
+      thrust::make_tuple(next_clusters_v.end(), cugraph::get_dataframe_buffer_end(output_buffer))),
+    detail::count_updown_moves_op_t<vertex_t, weight_t>{up_down});
+
+  if (multi_gpu) {
+    nr_moves = host_scalar_allreduce(
+      handle.get_comms(), nr_moves, raft::comms::op_t::SUM, handle.get_stream());
+  }
+
+  if (nr_moves == 0) { up_down = !up_down; }
+
   thrust::transform(handle.get_thrust_policy(),
                     next_clusters_v.begin(),
                     next_clusters_v.end(),
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 6b6470991bb..ebaae498d04 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -89,8 +89,9 @@ struct leiden_key_aggregated_edge_op_t {
 
     // E(Cr, S-Cr) > ||Cr||*(||S|| -||Cr||)
     bool is_dst_leiden_cluster_well_connected =
-      dst_leiden_cut_to_louvain >
-      resolution * dst_leiden_volume * (louvain_cluster_volume - dst_leiden_volume);
+      dst_leiden_cut_to_louvain > resolution * dst_leiden_volume *
+                                    (louvain_cluster_volume - dst_leiden_volume) /
+                                    total_edge_weight;
 
     // E(v, Cr-v) - ||v||* ||Cr-v||/||V(G)||
     // aggregated_weight_to_neighboring_leiden_cluster == E(v, Cr-v)?
@@ -98,11 +99,11 @@ struct leiden_key_aggregated_edge_op_t {
     weight_t mod_gain = -1.0;
     if (is_src_active > 0) {
       if ((louvain_of_dst_leiden_cluster == src_louvain_cluster) &&
-          is_dst_leiden_cluster_well_connected) {
+          (dst_leiden_cluster_id != src_leiden_cluster) && is_dst_leiden_cluster_well_connected) {
         mod_gain = aggregated_weight_to_neighboring_leiden_cluster -
-                   resolution * src_weighted_deg * (dst_leiden_volume - src_weighted_deg) /
-                     total_edge_weight;
-
+                   resolution * src_weighted_deg * dst_leiden_volume / total_edge_weight;
+// FIXME: Disable random moves in refinement phase for now.
+#if 0
         weight_t random_number{0.0};
         if (mod_gain > 0.0) {
           auto flat_id = uint64_t{threadIdx.x + blockIdx.x * blockDim.x};
@@ -117,6 +118,8 @@ struct leiden_key_aggregated_edge_op_t {
                      ? __expf(static_cast<float>((2.0 * mod_gain) / (theta * total_edge_weight))) *
                          random_number
                      : -1.0;
+#endif
+        mod_gain = mod_gain > 0.0 ? mod_gain : -1.0;
       }
     }
 
@@ -240,11 +243,12 @@ refine_clustering(
                     wcut_deg_and_cluster_vol_triple_begin,
                     wcut_deg_and_cluster_vol_triple_end,
                     singleton_and_connected_flags.begin(),
-                    [resolution] __device__(auto wcut_wdeg_and_louvain_volume) {
+                    [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) {
                       auto wcut           = thrust::get<0>(wcut_wdeg_and_louvain_volume);
                       auto wdeg           = thrust::get<1>(wcut_wdeg_and_louvain_volume);
                       auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume);
-                      return wcut > (resolution * wdeg * (louvain_volume - wdeg));
+                      return wcut >
+                             (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight);
                     });
 
   edge_src_property_t<GraphViewType, weight_t> src_louvain_cluster_weight_cache(handle);
@@ -478,7 +482,7 @@ refine_clustering(
     auto values_for_leiden_cluster_keys = thrust::make_zip_iterator(
       thrust::make_tuple(refined_community_volumes.begin(),
                          refined_community_cuts.begin(),
-                         leiden_keys_used_in_edge_reduction.begin(),  // redundant
+                         leiden_keys_used_in_edge_reduction.begin(),
                          louvain_of_leiden_keys_used_in_edge_reduction.begin()));
 
     using value_t = thrust::tuple<weight_t, weight_t, vertex_t, vertex_t>;
diff --git a/cpp/src/community/flatten_dendrogram.hpp b/cpp/src/community/flatten_dendrogram.hpp
index 9a0c103c01f..eac20389765 100644
--- a/cpp/src/community/flatten_dendrogram.hpp
+++ b/cpp/src/community/flatten_dendrogram.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,4 +59,31 @@ void partition_at_level(raft::handle_t const& handle,
     });
 }
 
+// Writes the flattened Leiden clustering into d_partition: level 0 seeds the assignment, then
+// each level pair (2l + 1, 2l + 2) is handed to cugraph::relabel as its label-pair table.
+template <typename vertex_t, bool multi_gpu>
+void leiden_partition_at_level(raft::handle_t const& handle,
+                               Dendrogram<vertex_t> const& dendrogram,
+                               vertex_t* d_partition,
+                               size_t level)
+{
+  vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0);
+  raft::copy(
+    d_partition, dendrogram.get_level_ptr_nocheck(0), local_num_verts, handle.get_stream());
+
+  // level is unsigned: clamp so that level == 0 yields no rounds instead of wrapping around.
+  size_t const num_relabel_rounds = (level > 0) ? (level - 1) / 2 : size_t{0};
+
+  for (size_t l = 0; l < num_relabel_rounds; ++l) {
+    cugraph::relabel<vertex_t, false>(
+      handle,
+      std::tuple<vertex_t const*, vertex_t const*>(dendrogram.get_level_ptr_nocheck(2 * l + 1),
+                                                   dendrogram.get_level_ptr_nocheck(2 * l + 2)),
+      dendrogram.get_level_size_nocheck(2 * l + 1),
+      d_partition,
+      local_num_verts,
+      false);
+  }
+}
+
 }  // namespace cugraph
diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh
index a9faf2f2d82..b6e20272de9 100644
--- a/cpp/src/community/leiden_impl.cuh
+++ b/cpp/src/community/leiden_impl.cuh
@@ -43,6 +43,34 @@ void check_clustering(graph_view_t<vertex_t, edge_t, false, multi_gpu> const& gr
   if (graph_view.local_vertex_partition_range_size() > 0)
     CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null");
 }
+// Sorts input_array, drops repeated values and shrinks it to the surviving elements. When
+// multi_gpu is true the survivors go through shuffle_ext_vertices_to_local_gpu_by_vertex_
+// partitioning, are deduplicated again, and the per-rank counts are summed across ranks.
+template <typename vertex_t, bool multi_gpu>
+vertex_t remove_duplicates(raft::handle_t const& handle, rmm::device_uvector<vertex_t>& input_array)
+{
+  // Local pass: sort + unique + shrink; yields the number of elements kept on this rank.
+  auto sort_and_compact = [&handle, &input_array]() {
+    thrust::sort(handle.get_thrust_policy(), input_array.begin(), input_array.end());
+    auto unique_last =
+      thrust::unique(handle.get_thrust_policy(), input_array.begin(), input_array.end());
+    auto kept = static_cast<vertex_t>(thrust::distance(input_array.begin(), unique_last));
+    input_array.resize(kept, handle.get_stream());
+    return kept;
+  };
+
+  vertex_t unique_count = sort_and_compact();
+
+  if constexpr (multi_gpu) {
+    input_array = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
+      handle, std::move(input_array));
+
+    unique_count = host_scalar_allreduce(
+      handle.get_comms(), sort_and_compact(), raft::comms::op_t::SUM, handle.get_stream());
+  }
+
+  return unique_count;
+}
 
 template <typename vertex_t,
           typename edge_t,
@@ -80,7 +108,7 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
 
   rmm::device_uvector<vertex_t> louvain_of_refined_graph(0, handle.get_stream());  // #V
 
-  while (dendrogram->num_levels() < max_level) {
+  while (dendrogram->num_levels() < 2 * max_level + 1) {
     //
     //  Initialize every cluster to reference each vertex to itself
     //
@@ -353,40 +381,8 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
                  dendrogram->current_level_begin(),
                  dendrogram->current_level_begin() + dendrogram->current_level_size(),
                  copied_louvain_partition.begin());
-
-    thrust::sort(
-      handle.get_thrust_policy(), copied_louvain_partition.begin(), copied_louvain_partition.end());
-
     auto nr_unique_louvain_clusters =
-      static_cast<vertex_t>(thrust::distance(copied_louvain_partition.begin(),
-                                             thrust::unique(handle.get_thrust_policy(),
-                                                            copied_louvain_partition.begin(),
-                                                            copied_louvain_partition.end())));
-
-    copied_louvain_partition.resize(nr_unique_louvain_clusters, handle.get_stream());
-
-    if constexpr (graph_view_t::is_multi_gpu) {
-      copied_louvain_partition =
-        cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(
-          handle, std::move(copied_louvain_partition));
-
-      thrust::sort(handle.get_thrust_policy(),
-                   copied_louvain_partition.begin(),
-                   copied_louvain_partition.end());
-
-      nr_unique_louvain_clusters =
-        static_cast<vertex_t>(thrust::distance(copied_louvain_partition.begin(),
-                                               thrust::unique(handle.get_thrust_policy(),
-                                                              copied_louvain_partition.begin(),
-                                                              copied_louvain_partition.end())));
-
-      copied_louvain_partition.resize(nr_unique_louvain_clusters, handle.get_stream());
-
-      nr_unique_louvain_clusters = host_scalar_allreduce(handle.get_comms(),
-                                                         nr_unique_louvain_clusters,
-                                                         raft::comms::op_t::SUM,
-                                                         handle.get_stream());
-    }
+      remove_duplicates<vertex_t, multi_gpu>(handle, copied_louvain_partition);
 
     terminate =
       terminate || (nr_unique_louvain_clusters == current_graph_view.number_of_vertices());
@@ -481,6 +477,15 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
           (*cluster_assignment).data(),
           (*cluster_assignment).size(),
           false);
+        // louvain assignment of aggregated graph which is necessary to flatten dendrogram
+        dendrogram->add_level(current_graph_view.local_vertex_partition_range_first(),
+                              current_graph_view.local_vertex_partition_range_size(),
+                              handle.get_stream());
+
+        raft::copy(dendrogram->current_level_begin(),
+                   (*cluster_assignment).begin(),
+                   (*cluster_assignment).size(),
+                   handle.get_stream());
 
         louvain_of_refined_graph.resize(current_graph_view.local_vertex_partition_range_size(),
                                         handle.get_stream());
@@ -492,47 +497,6 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
       }
     }
 
-    // Relabel dendrogram
-    vertex_t local_cluster_id_first{0};
-    if constexpr (multi_gpu) {
-      auto unique_cluster_range_lasts = cugraph::partition_manager::compute_partition_range_lasts(
-        handle, static_cast<vertex_t>(copied_louvain_partition.size()));
-
-      auto& comm           = handle.get_comms();
-      auto const comm_size = comm.get_size();
-      auto const comm_rank = comm.get_rank();
-      auto& major_comm     = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
-      auto const major_comm_size = major_comm.get_size();
-      auto const major_comm_rank = major_comm.get_rank();
-      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-      auto const minor_comm_size = minor_comm.get_size();
-      auto const minor_comm_rank = minor_comm.get_rank();
-
-      auto vertex_partition_id =
-        partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks(
-          major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
-
-      local_cluster_id_first = vertex_partition_id == 0
-                                 ? vertex_t{0}
-                                 : unique_cluster_range_lasts[vertex_partition_id - 1];
-    }
-
-    rmm::device_uvector<vertex_t> numbering_indices(copied_louvain_partition.size(),
-                                                    handle.get_stream());
-    detail::sequence_fill(handle.get_stream(),
-                          numbering_indices.data(),
-                          numbering_indices.size(),
-                          local_cluster_id_first);
-
-    relabel<vertex_t, multi_gpu>(
-      handle,
-      std::make_tuple(static_cast<vertex_t const*>(copied_louvain_partition.begin()),
-                      static_cast<vertex_t const*>(numbering_indices.begin())),
-      copied_louvain_partition.size(),
-      dendrogram->current_level_begin(),
-      dendrogram->current_level_size(),
-      false);
-
     copied_louvain_partition.resize(0, handle.get_stream());
     copied_louvain_partition.shrink_to_fit(handle.get_stream());
 
@@ -550,23 +514,71 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
   return std::make_pair(std::move(dendrogram), best_modularity);
 }
 
-// FIXME: Can we have a common flatten_dendrogram to be used by both
-// Louvain and Leiden, and possibly other clustering methods?
+// Relabels clustering[0, num_nodes) through cugraph's relabel, pairing unique_cluster_ids with
+// a sequence produced by detail::sequence_fill that starts at local_cluster_id_first. In the
+// multi-GPU case that start value is read from the partition range lasts for this rank.
+template <typename vertex_t, bool multi_gpu>
+void relabel_cluster_ids(raft::handle_t const& handle,
+                         rmm::device_uvector<vertex_t>& unique_cluster_ids,
+                         vertex_t* clustering,
+                         size_t num_nodes)
+{
+  vertex_t local_cluster_id_first{0};
+  if constexpr (multi_gpu) {
+    auto unique_cluster_range_lasts = cugraph::partition_manager::compute_partition_range_lasts(
+      handle, static_cast<vertex_t>(unique_cluster_ids.size()));
+
+    auto& major_comm           = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+    auto const major_comm_size = major_comm.get_size();
+    auto const major_comm_rank = major_comm.get_rank();
+    auto& minor_comm           = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+    auto const minor_comm_size = minor_comm.get_size();
+    auto const minor_comm_rank = minor_comm.get_rank();
+
+    auto vertex_partition_id =
+      partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks(
+        major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);
+
+    local_cluster_id_first =
+      vertex_partition_id == 0 ? vertex_t{0} : unique_cluster_range_lasts[vertex_partition_id - 1];
+  }
+
+  rmm::device_uvector<vertex_t> numbering_indices(unique_cluster_ids.size(), handle.get_stream());
+  detail::sequence_fill(handle.get_stream(),
+                        numbering_indices.data(),
+                        numbering_indices.size(),
+                        local_cluster_id_first);
+
+  relabel<vertex_t, multi_gpu>(
+    handle,
+    std::make_tuple(static_cast<vertex_t const*>(unique_cluster_ids.begin()),
+                    static_cast<vertex_t const*>(numbering_indices.begin())),
+    unique_cluster_ids.size(),
+    clustering,
+    num_nodes,
+    false);
+}
+
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-void flatten_dendrogram(raft::handle_t const& handle,
-                        graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-                        Dendrogram<vertex_t> const& dendrogram,
-                        vertex_t* clustering)
+void flatten_leiden_dendrogram(raft::handle_t const& handle,
+                               graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+                               Dendrogram<vertex_t> const& dendrogram,
+                               vertex_t* clustering)
 {
-  rmm::device_uvector<vertex_t> vertex_ids_v(graph_view.number_of_vertices(), handle.get_stream());
+  leiden_partition_at_level<vertex_t, multi_gpu>(
+    handle, dendrogram, clustering, dendrogram.num_levels());
+
+  rmm::device_uvector<vertex_t> unique_cluster_ids(graph_view.number_of_vertices(),
+                                                   handle.get_stream());
+  thrust::copy(handle.get_thrust_policy(),
+               clustering,
+               clustering + graph_view.number_of_vertices(),
+               unique_cluster_ids.begin());
 
-  thrust::sequence(handle.get_thrust_policy(),
-                   vertex_ids_v.begin(),
-                   vertex_ids_v.end(),
-                   graph_view.local_vertex_partition_range_first());
+  remove_duplicates<vertex_t, multi_gpu>(handle, unique_cluster_ids);
 
-  partition_at_level<vertex_t, multi_gpu>(
-    handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels());
+  relabel_cluster_ids<vertex_t, multi_gpu>(
+    handle, unique_cluster_ids, clustering, graph_view.number_of_vertices());
 }
 
 }  // namespace detail
@@ -588,14 +600,14 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
 }
 
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-void flatten_dendrogram(raft::handle_t const& handle,
-                        graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-                        Dendrogram<vertex_t> const& dendrogram,
-                        vertex_t* clustering)
+void flatten_leiden_dendrogram(raft::handle_t const& handle,
+                               graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+                               Dendrogram<vertex_t> const& dendrogram,
+                               vertex_t* clustering)
 {
   CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented.");
 
-  detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering);
+  detail::flatten_leiden_dendrogram(handle, graph_view, dendrogram, clustering);
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
@@ -620,7 +632,7 @@ std::pair<size_t, weight_t> leiden(
   std::tie(dendrogram, modularity) =
     detail::leiden(handle, rng_state, graph_view, edge_weight_view, max_level, resolution, theta);
 
-  detail::flatten_dendrogram(handle, graph_view, *dendrogram, clustering);
+  detail::flatten_leiden_dendrogram(handle, graph_view, *dendrogram, clustering);
 
   return std::make_pair(dendrogram->num_levels(), modularity);
 }
diff --git a/cpp/tests/c_api/leiden_test.c b/cpp/tests/c_api/leiden_test.c
index 9e91adf9f89..df206ebd1ed 100644
--- a/cpp/tests/c_api/leiden_test.c
+++ b/cpp/tests/c_api/leiden_test.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,7 +161,7 @@ int test_leiden_no_weights()
   vertex_t h_src[]             = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
   vertex_t h_dst[]             = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
   vertex_t h_result[]          = {1, 1, 1, 2, 0, 0};
-  weight_t expected_modularity = 0.0859375;
+  weight_t expected_modularity = 0.125;
 
   // Louvain wants store_transposed = FALSE
   return generic_leiden_test(h_src,
diff --git a/cpp/tests/c_api/louvain_test.c b/cpp/tests/c_api/louvain_test.c
index e9ac5c9ff06..41d777545b2 100644
--- a/cpp/tests/c_api/louvain_test.c
+++ b/cpp/tests/c_api/louvain_test.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,22 +46,39 @@ int generic_louvain_test(vertex_t* h_src,
   cugraph_graph_t* p_graph                           = NULL;
   cugraph_hierarchical_clustering_result_t* p_result = NULL;
 
-  data_type_id_t vertex_tid = INT32;
-  data_type_id_t edge_tid   = INT32;
-  data_type_id_t weight_tid = FLOAT32;
+  data_type_id_t vertex_tid    = INT32;
+  data_type_id_t edge_tid      = INT32;
+  data_type_id_t weight_tid    = FLOAT32;
   data_type_id_t edge_id_tid   = INT32;
   data_type_id_t edge_type_tid = INT32;
 
   p_handle = cugraph_create_resource_handle(NULL);
   TEST_ASSERT(test_ret_value, p_handle != NULL, "resource handle creation failed.");
 
-  ret_code = create_sg_test_graph(p_handle, vertex_tid, edge_tid, h_src, h_dst, weight_tid, h_wgt, edge_type_tid, NULL, edge_id_tid, NULL, num_edges, store_transposed, FALSE, FALSE, FALSE, &p_graph, &ret_error);
+  ret_code = create_sg_test_graph(p_handle,
+                                  vertex_tid,
+                                  edge_tid,
+                                  h_src,
+                                  h_dst,
+                                  weight_tid,
+                                  h_wgt,
+                                  edge_type_tid,
+                                  NULL,
+                                  edge_id_tid,
+                                  NULL,
+                                  num_edges,
+                                  store_transposed,
+                                  FALSE,
+                                  FALSE,
+                                  FALSE,
+                                  &p_graph,
+                                  &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed.");
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
 
-  ret_code =
-    cugraph_louvain(p_handle, p_graph, max_level, threshold, resolution, FALSE, &p_result, &ret_error);
+  ret_code = cugraph_louvain(
+    p_handle, p_graph, max_level, threshold, resolution, FALSE, &p_result, &ret_error);
 
   TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error));
   TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, "cugraph_louvain failed.");
@@ -141,10 +158,10 @@ int test_louvain_no_weight()
   weight_t threshold  = 1e-7;
   weight_t resolution = 1.0;
 
-  vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
-  vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
-  vertex_t h_result[]          = {1, 1, 1, 2, 0, 0};
-  weight_t expected_modularity = 0.0859375;
+  vertex_t h_src[]             = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5};
+  vertex_t h_dst[]             = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4};
+  vertex_t h_result[]          = {1, 1, 1, 1, 0, 0};
+  weight_t expected_modularity = 0.125;
 
   // Louvain wants store_transposed = FALSE
   return generic_louvain_test(h_src,
diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp
index 1e1fb6d4c33..284dcc94b8c 100644
--- a/cpp/tests/community/louvain_test.cpp
+++ b/cpp/tests/community/louvain_test.cpp
@@ -317,72 +317,6 @@ TEST(louvain_legacy, success)
   }
 }
 
-TEST(louvain_legacy_renumbered, success)
-{
-  raft::handle_t handle;
-
-  auto stream = handle.get_stream();
-
-  std::vector<int> off_h = {0,   16,  25,  30,  34,  38,  42,  44,  46,  48,  50,  52,
-                            54,  56,  73,  85,  95,  101, 107, 112, 117, 121, 125, 129,
-                            132, 135, 138, 141, 144, 147, 149, 151, 153, 155, 156};
-  std::vector<int> ind_h = {
-    1,  3,  7,  11, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 30, 33, 0,  5,  11, 15, 16, 19, 21,
-    25, 30, 4,  13, 14, 22, 27, 0,  9,  20, 24, 2,  13, 15, 26, 1,  13, 14, 18, 13, 15, 0,  16,
-    13, 14, 3,  20, 13, 14, 0,  1,  13, 22, 2,  4,  5,  6,  8,  10, 12, 14, 17, 18, 19, 22, 25,
-    28, 29, 31, 32, 2,  5,  8,  10, 13, 15, 17, 18, 22, 29, 31, 32, 0,  1,  4,  6,  14, 16, 18,
-    19, 21, 28, 0,  1,  7,  15, 19, 21, 0,  13, 14, 26, 27, 28, 0,  5,  13, 14, 15, 0,  1,  13,
-    16, 16, 0,  3,  9,  23, 0,  1,  15, 16, 2,  12, 13, 14, 0,  20, 24, 0,  3,  23, 0,  1,  13,
-    4,  17, 27, 2,  17, 26, 13, 15, 17, 13, 14, 0,  1,  13, 14, 13, 14, 0};
-
-  std::vector<float> w_h = {
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-    1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-
-  int num_verts = off_h.size() - 1;
-  int num_edges = ind_h.size();
-
-  rmm::device_uvector<int> offsets_v(num_verts + 1, stream);
-  rmm::device_uvector<int> indices_v(num_edges, stream);
-  rmm::device_uvector<float> weights_v(num_edges, stream);
-  rmm::device_uvector<int> result_v(num_verts, stream);
-
-  raft::update_device(offsets_v.data(), off_h.data(), off_h.size(), stream);
-  raft::update_device(indices_v.data(), ind_h.data(), ind_h.size(), stream);
-  raft::update_device(weights_v.data(), w_h.data(), w_h.size(), stream);
-
-  cugraph::legacy::GraphCSRView<int, int, float> G(
-    offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges);
-
-  float modularity{0.0};
-  size_t num_level = 40;
-
-  // "FIXME": remove this check once we drop support for Pascal
-  //
-  // Calling louvain on Pascal will throw an exception, we'll check that
-  // this is the behavior while we still support Pascal (device_prop.major < 7)
-  //
-  if (handle.get_device_properties().major < 7) {
-    EXPECT_THROW(cugraph::louvain(handle, G, result_v.data()), cugraph::logic_error);
-  } else {
-    std::tie(num_level, modularity) = cugraph::louvain(handle, G, result_v.data());
-
-    auto cluster_id = cugraph::test::to_host(handle, result_v);
-
-    int min = *min_element(cluster_id.begin(), cluster_id.end());
-
-    ASSERT_GE(min, 0);
-    ASSERT_FLOAT_EQ(modularity, 0.41880345);
-  }
-}
-
 using Tests_Louvain_File   = Tests_Louvain<cugraph::test::File_Usecase>;
 using Tests_Louvain_File32 = Tests_Louvain<cugraph::test::File_Usecase>;
 using Tests_Louvain_File64 = Tests_Louvain<cugraph::test::File_Usecase>;
@@ -390,11 +324,15 @@ using Tests_Louvain_Rmat   = Tests_Louvain<cugraph::test::Rmat_Usecase>;
 using Tests_Louvain_Rmat32 = Tests_Louvain<cugraph::test::Rmat_Usecase>;
 using Tests_Louvain_Rmat64 = Tests_Louvain<cugraph::test::Rmat_Usecase>;
 
+#if 0
+// FIXME: Re-enable the legacy test once the threshold parameter is exposed
+//  by the legacy louvain API.
 TEST_P(Tests_Louvain_File, CheckInt32Int32FloatFloatLegacy)
 {
   run_legacy_test<int32_t, int32_t, float, float>(
     override_File_Usecase_with_cmd_line_arguments(GetParam()));
 }
+#endif
 
 TEST_P(Tests_Louvain_File, CheckInt32Int32FloatFloat)
 {
@@ -458,11 +396,12 @@ TEST_P(Tests_Louvain_Rmat64, CheckInt64Int64FloatFloat)
 INSTANTIATE_TEST_SUITE_P(
   simple_test,
   Tests_Louvain_File,
-  ::testing::Combine(
-    ::testing::Values(Louvain_Usecase{std::nullopt, std::nullopt, std::nullopt, true, 3, 0.408695},
-                      Louvain_Usecase{20, double{1e-4}, std::nullopt, true, 3, 0.408695},
-                      Louvain_Usecase{100, double{1e-4}, double{0.8}, true, 3, 0.48336622}),
-    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+  ::testing::Combine(::testing::Values(
+                       Louvain_Usecase{
+                         std::nullopt, std::nullopt, std::nullopt, true, 3, 0.39907956},
+                       Louvain_Usecase{20, double{1e-3}, std::nullopt, true, 3, 0.39907956},
+                       Louvain_Usecase{100, double{1e-3}, double{0.8}, true, 3, 0.47547662}),
+                     ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
 
 INSTANTIATE_TEST_SUITE_P(
   file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
diff --git a/python/cugraph/cugraph/tests/community/test_leiden.py b/python/cugraph/cugraph/tests/community/test_leiden.py
index a06b0dd22c5..71117c4210f 100644
--- a/python/cugraph/cugraph/tests/community/test_leiden.py
+++ b/python/cugraph/cugraph/tests/community/test_leiden.py
@@ -22,8 +22,6 @@
 from cugraph.testing import utils, UNDIRECTED_DATASETS
 from cugraph.datasets import karate_asymmetric
 
-from cudf.testing.testing import assert_series_equal
-
 
 # =============================================================================
 # Test data
@@ -43,8 +41,8 @@
         "resolution": 1.0,
         "input_type": "COO",
         "expected_output": {
-            "partition": [1, 0, 1, 2, 2, 2],
-            "modularity_score": 0.1757322,
+            "partition": [0, 0, 0, 1, 1, 1],
+            "modularity_score": 0.215969,
         },
     },
     "data_2": {
@@ -85,10 +83,10 @@
         "input_type": "CSR",
         "expected_output": {
             # fmt: off
-            "partition": [6, 6, 3, 3, 1, 5, 5, 3, 0, 3, 1, 6, 3, 3, 4, 4, 5, 6, 4, 6, 4,
-                          6, 4, 4, 2, 2, 4, 4, 2, 4, 0, 2, 4, 4],
+            "partition": [3, 3, 3, 3, 2, 2, 2, 3, 1, 3, 2, 3, 3, 3, 1, 1, 2, 3, 1, 3,
+                          1, 3, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
             # fmt: on
-            "modularity_score": 0.3468113,
+            "modularity_score": 0.41880345,
         },
     },
 }
@@ -138,7 +136,7 @@ def input_and_expected_output(request):
         # Create graph from csr
         offsets = src_or_offset_array
         indices = dst_or_index_array
-        G.from_cudf_adjlist(offsets, indices, weight)
+        G.from_cudf_adjlist(offsets, indices, weight, renumber=False)
 
     parts, mod = cugraph.leiden(G, max_level, resolution)
 
@@ -223,9 +221,7 @@ def test_leiden_directed_graph():
 
 @pytest.mark.sg
 def test_leiden_golden_results(input_and_expected_output):
-    expected_partition = cudf.Series(
-        input_and_expected_output["expected_output"]["partition"]
-    )
+    expected_partition = input_and_expected_output["expected_output"]["partition"]
     expected_mod = input_and_expected_output["expected_output"]["modularity_score"]
 
     result_partition = input_and_expected_output["result_output"]["partition"]
@@ -233,6 +229,10 @@ def test_leiden_golden_results(input_and_expected_output):
 
     assert abs(expected_mod - result_mod) < 0.0001
 
-    assert_series_equal(
-        expected_partition, result_partition, check_dtype=False, check_names=False
-    )
+    # Partition ids are arbitrary labels, so require a one-to-one relabeling
+    # between expected and result clusters rather than literal equality.
+    assert len(expected_partition) == len(result_partition)
+    expected_to_result_map = {}
+    for e, r in zip(expected_partition, result_partition.to_pandas()):
+        assert expected_to_result_map.setdefault(e, r) == r
+    assert len(set(expected_to_result_map.values())) == len(expected_to_result_map)